18import matplotlib.pyplot
as plt
19import matplotlib.artist
20import matplotlib.figure
21import matplotlib.gridspec
22import matplotlib.colors
23import matplotlib.patches
24import matplotlib.ticker
25import matplotlib.patheffects
as PathEffects
28from basf2_mva_evaluation
import histogram
40plt.style.use(
"belle2")
45 Base class for all Plotters.
77 def __init__(self, figure=None, axis=None, dpi=None):
79 Creates a new figure and axis if None is given, sets the default plot parameters
80 @param figure default draw figure which is used
81 @param axis default draw axis which is used
82 @param dpi dpi for the matplotlib figure, if None default is used
84 b2.B2INFO(
"Create new figure for class " + str(type(self)))
89 self.
figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=dpi)
104 self.
xmin, self.
xmax = float(0), float(1)
106 self.
ymin, self.
ymax = float(0), float(1)
127 self.
prop_cycler = itertools.cycle(plt.rcParams[
"axes.prop_cycle"])
131 Adds a new subplot to the figure, updates all other axes
132 according to the given gridspec
133 @param gridspecs gridspecs for all axes including the new one
135 for gs, ax
in zip(gridspecs[:-1], self.
figure.axes):
136 ax.set_position(gs.get_position(self.
figure))
137 ax.set_subplotspec(gs)
143 Save the figure into a file
144 @param filename of the file
146 b2.B2INFO(
"Save figure for class " + str(type(self)))
147 from matplotlib.backends.backend_agg
import FigureCanvasAgg
as FigureCanvas
148 canvas = FigureCanvas(self.
figure)
149 canvas.print_figure(filename, dpi=self.
dpi, bbox_inches=
'tight')
154 Overrides default plot options for datapoint plot
155 @param plot_kwargs keyword arguments for the plot function
162 Overrides default errorbar options for datapoint errorbars
163 @param errorbar_kwargs keyword arguments for the errorbar function
170 Overrides default errorband options for datapoint errorband
171 @param errorband_kwargs keyword arguments for the fill_between function
178 Overrides default fill_between options for datapoint errorband
179 @param fill_kwargs keyword arguments for the fill_between function
186 Plot the given datapoints with plot and errorbar, and draw an errorband with fill_between
187 @param x coordinates of the data points
188 @param y coordinates of the data points
189 @param xerr symmetric error on x data points
190 @param yerr symmetric error on y data points
198 if plot_kwargs
is None or 'color' not in plot_kwargs:
200 color = color[
'color']
201 plot_kwargs[
'color'] = color
203 color = plot_kwargs[
'color']
204 color = matplotlib.colors.ColorConverter().to_rgb(color)
205 patch = matplotlib.patches.Patch(color=color, alpha=0.5)
206 patch.get_color = patch.get_facecolor
209 if plot_kwargs
is not None:
210 p, = axis.plot(x, y, rasterized=
True, **plot_kwargs)
213 if errorbar_kwargs
is not None and (xerr
is not None or yerr
is not None):
214 if 'color' not in errorbar_kwargs:
215 errorbar_kwargs[
'color'] = color
216 if 'ecolor' not in errorbar_kwargs:
217 errorbar_kwargs[
'ecolor'] = [0.5 * x
for x
in color]
222 if not isinstance(xerr, (numpy.ndarray, list)):
223 xerr = xerr*numpy.ones(len(x))
224 if not isinstance(yerr, (numpy.ndarray, list)):
225 yerr = yerr*numpy.ones(len(y))
226 mask = numpy.logical_and.reduce([numpy.isfinite(v)
for v
in [x, y, xerr, yerr]])
229 x[mask], y[mask], xerr=numpy.where(
230 xerr[mask] < 0, 0.0, xerr[mask]), yerr=numpy.where(
231 yerr[mask] < 0, 0.0, yerr[mask]), rasterized=
True, **errorbar_kwargs)
234 if errorband_kwargs
is not None and yerr
is not None:
235 if 'color' not in errorband_kwargs:
236 errorband_kwargs[
'color'] = color
241 for _x, _y, _xe, _ye
in zip(x, y, xerr, yerr):
242 axis.add_patch(matplotlib.patches.Rectangle((_x - _xe, _y - _ye), 2 * _xe, 2 * _ye, rasterized=
True,
245 f = axis.fill_between(x, y - yerr, y + yerr, interpolate=
True, rasterized=
True, **errorband_kwargs)
247 if fill_kwargs
is not None:
249 x = numpy.append(x, x[-1]+2*xerr[-1])
250 y = numpy.append(y, y[-1])
251 xerr = numpy.append(xerr, xerr[-1])
253 axis.fill_between(x-xerr, y, 0, rasterized=
True, **fill_kwargs)
255 return (tuple(patches), p, e, f)
def add(self, *args, **kwargs):
    """
    Add a new plot to this plotter.

    Abstract hook of the base class: it contains no plotting logic of its own,
    the concrete plotters provide their own add() with their own parameters.
    @param args positional arguments, only meaningful for an overriding add()
    @param kwargs keyword arguments, only meaningful for an overriding add()
    @raise NotImplementedError always, because the base class cannot add a plot
    """
    # NotImplemented is the sentinel reserved for binary special methods
    # (__eq__, __add__, ...). Returning it from a regular method lets a call
    # on the base class pass silently, so signal the missing override loudly.
    raise NotImplementedError(f"{type(self).__name__} does not implement add()")
265 Sets the limits of the axis with an optional expansion factor.
268 factor (float): Fraction by which to expand the axis limits beyond the data range.
272 self.
axis.set_xlim((self.
xmin - factor*dx, self.
xmax + factor*dx))
273 self.
axis.set_ylim((self.
ymin - factor*dy, self.
ymax + factor*dy))
277 Finish plotting and set labels, legends and stuff
279 return NotImplemented
283 Scale limits to increase distance to boundaries
294 Plots the purity and the efficiency over the cut value (for cut choosing)
301 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True):
303 Add a new curve to the plot
304 @param data pandas.DataFrame containing all data
305 @param column which is used to calculate efficiency and purity for different cuts
306 @param signal_mask boolean numpy.array defining which events are signal events
307 @param bckgrd_mask boolean numpy.array defining which events are background events
308 @param weight_column column in data containing the weights for each event
309 @param normed boolean if True, efficiency and purity (normalized to 1) are drawn, otherwise the true and false positives
312 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
315 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
316 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
318 efficiency, efficiency_error = hists.get_true_positives([
'Signal'])
319 purity, purity_error = hists.get_false_positives([
'Background'])
321 if isinstance(efficiency, int)
and not isinstance(purity, int):
322 efficiency = numpy.array([efficiency] * len(purity))
323 elif isinstance(purity, int)
and not isinstance(efficiency, int):
324 purity = numpy.array([purity] * len(efficiency))
325 elif isinstance(purity, int)
and isinstance(efficiency, int):
326 efficiency = numpy.array([efficiency])
327 purity = numpy.array([purity])
328 cuts = hists.bin_centers
330 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(cuts, self.
xmin)), numpy.nanmax(numpy.append(cuts, self.
xmax))
333 (efficiency, purity, [
334 self.
ymin]))), numpy.nanmax(
336 (efficiency, purity, [
343 self.
labels.append(
"Efficiency")
345 self.
labels.append(
"True positive")
351 self.
labels.append(
"Purity")
353 self.
labels.append(
"False positive")
355 self.
axis.set_title(
"Classification Plot")
361 Sets limits, title, axis-labels and legend of the plot
364 self.
axis.get_xaxis().set_label_text(
'Cut Value')
365 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
371 Plots the signal to noise ratio over the cut value (for cut choosing)
378 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
380 Add a new curve to the plot
381 @param data pandas.DataFrame containing all data
382 @param column which is used to calculate signal to noise ratio for different cuts
383 @param signal_mask boolean numpy.array defining which events are signal events
384 @param bckgrd_mask boolean numpy.array defining which events are background events
385 @param weight_column column in data containing the weights for each event
386 @param label label for the plot legend
388 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
389 signal2noise, signal2noise_error = hists.get_signal_to_noise([
'Signal'], [
'Background'])
390 cuts = hists.bin_centers
392 valid = numpy.isfinite(signal2noise)
393 signal2noise = signal2noise[valid]
394 signal2noise_error = signal2noise_error[valid]
398 if len(signal2noise) == 0
or numpy.all(numpy.isnan(signal2noise)):
401 best_idx = numpy.nanargmax(signal2noise)
402 best_cut = cuts[best_idx]
403 best_signal2noise = signal2noise[best_idx]
405 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(cuts, self.
xmin)), numpy.nanmax(numpy.append(cuts, self.
xmax))
408 signal2noise, self.
ymin)), numpy.nanmax(
410 signal2noise, self.
ymax))
417 if best_idx
is not None:
418 self.
axis.
plot(best_cut, best_signal2noise,
'x', color=p[1].get_color(), markersize=8, label=
'Best cut')
419 self.
axis.axvline(best_cut, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
420 self.
axis.axhline(best_signal2noise, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
423 cut_label = f
"{label[:10] if label else column[:10]} (Best cut: {best_cut:.3f}, S/N: {best_signal2noise:.2f})"
424 self.
labels.append(cut_label)
429 Sets limits, title, axis-labels and legend of the plot
432 self.
axis.set_title(
"Signal to Noise Plot")
433 self.
axis.get_xaxis().set_label_text(
'Cut Value')
434 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
440 Plots the purity over the efficiency also known as ROC curve
447 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
449 Add a new curve to the ROC plot
450 @param data pandas.DataFrame containing all data
451 @param column which is used to calculate efficiency and purity for different cuts
452 @param signal_mask boolean numpy.array defining which events are signal events
453 @param bckgrd_mask boolean numpy.array defining which events are background events
454 @param weight_column column in data containing the weights for each event
455 @param label label for the plot legend
457 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
458 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
459 purity, purity_error = hists.get_purity([
'Signal'], [
'Background'])
460 if isinstance(efficiency, int)
and not isinstance(purity, int):
461 efficiency = numpy.array([efficiency] * len(purity))
462 elif isinstance(purity, int)
and not isinstance(efficiency, int):
463 purity = numpy.array([purity] * len(efficiency))
464 elif isinstance(purity, int)
and isinstance(efficiency, int):
465 efficiency = numpy.array([efficiency])
466 purity = numpy.array([purity])
467 cuts = hists.bin_centers
469 valid = numpy.isfinite(purity) & numpy.isfinite(efficiency)
470 efficiency = efficiency[valid]
471 purity = purity[valid]
473 if not isinstance(efficiency_error, int):
474 efficiency_error = efficiency_error[valid]
475 if not isinstance(purity_error, int):
476 purity_error = purity_error[valid]
479 distance = numpy.sqrt(numpy.square(1 - purity) + numpy.square(1 - efficiency))
480 if len(distance) == 0
or numpy.all(numpy.isnan(distance)):
483 best_idx = numpy.nanargmin(distance)
484 best_cut = cuts[best_idx]
485 best_efficiency = efficiency[best_idx]
486 best_purity = purity[best_idx]
488 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(efficiency, self.
xmin)), numpy.nanmax(numpy.append(efficiency, self.
xmax))
489 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(purity, self.
ymin)), numpy.nanmax(numpy.append(purity, self.
ymax))
492 p = self.
_plot_datapoints(self.
axis, efficiency, purity, xerr=efficiency_error, yerr=purity_error)
495 if best_idx
is not None:
497 self.
axis.
plot(best_efficiency, best_purity,
'x', color=p[1].get_color(), markersize=8, label=
'Best cut')
498 self.
axis.axhline(best_purity, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
499 self.
axis.axvline(best_efficiency, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
502 cut_label = f
"{label[:10] if label else column[:10]} (Best cut: {best_cut:.3f})"
503 self.
labels.append(cut_label)
508 Sets limits, title, axis-labels and legend of the plot
511 self.
axis.set_title(
"ROC Purity Plot")
512 self.
axis.get_xaxis().set_label_text(
'Efficiency')
513 self.
axis.get_yaxis().set_label_text(
'Purity')
514 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
520 Plots the rejection over the efficiency also known as ROC curve
527 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
529 Add a new curve to the ROC plot
530 @param data pandas.DataFrame containing all data
531 @param column which is used to calculate efficiency and rejection for different cuts
532 @param signal_mask boolean numpy.array defining which events are signal events
533 @param bckgrd_mask boolean numpy.array defining which events are background events
534 @param weight_column column in data containing the weights for each event
535 @param label label for the plot legend
537 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
538 efficiency, efficiency_error = hists.get_efficiency([
'Signal'])
539 rejection, rejection_error = hists.get_efficiency([
'Background'])
540 rejection = 1 - rejection
541 if isinstance(efficiency, int)
and not isinstance(rejection, int):
542 efficiency = numpy.array([efficiency] * len(rejection))
543 elif isinstance(rejection, int)
and not isinstance(efficiency, int):
544 rejection = numpy.array([rejection] * len(efficiency))
545 elif isinstance(rejection, int)
and isinstance(efficiency, int):
546 efficiency = numpy.array([efficiency])
547 rejection = numpy.array([rejection])
548 cuts = hists.bin_centers
550 valid = numpy.isfinite(rejection) & numpy.isfinite(efficiency)
551 efficiency = efficiency[valid]
552 rejection = rejection[valid]
554 if not isinstance(efficiency_error, int):
555 efficiency_error = efficiency_error[valid]
556 if not isinstance(rejection_error, int):
557 rejection_error = rejection_error[valid]
560 distance = numpy.sqrt(numpy.square(1 - rejection) + numpy.square(1 - efficiency))
561 if len(distance) == 0
or numpy.all(numpy.isnan(distance)):
564 best_idx = numpy.nanargmin(distance)
565 best_cut = cuts[best_idx]
566 best_rejection = rejection[best_idx]
567 best_efficiency = efficiency[best_idx]
569 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(efficiency, self.
xmin)), numpy.nanmax(numpy.append(efficiency, self.
xmax))
570 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(rejection, self.
ymin)), numpy.nanmax(numpy.append(rejection, self.
ymax))
572 auc = numpy.abs(numpy.trapz(rejection, efficiency))
575 p = self.
_plot_datapoints(self.
axis, efficiency, rejection, xerr=efficiency_error, yerr=rejection_error)
578 if best_idx
is not None:
580 self.
axis.
plot(best_efficiency, best_rejection,
'x', color=p[1].get_color(), markersize=8, label=
'Best cut')
581 self.
axis.axhline(best_rejection, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
582 self.
axis.axvline(best_efficiency, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
585 cut_label = f
"{label[:10] if label else column[:10]} (AUC: {auc:.2f}, Best cut: {best_cut:.3f})"
586 self.
labels.append(cut_label)
591 Sets limits, title, axis-labels and legend of the plot
594 self.
axis.set_title(
"ROC Rejection Plot")
595 self.
axis.get_yaxis().set_label_text(
'Background Rejection')
596 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
598 self.
axis.get_xaxis().set_label_text(
'Signal Efficiency')
604 Plots the true ROC curve: True Positive Rate (TPR) vs False Positive Rate (FPR),
605 and marks the cut that gives the point closest to the ideal (0,1).
612 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
614 Add a new curve to the ROC plot
615 @param data pandas.DataFrame containing all data
616 @param column which is used to calculate the true and false positive rates for different cuts
617 @param signal_mask boolean numpy.array defining which events are signal events
618 @param bckgrd_mask boolean numpy.array defining which events are background events
619 @param weight_column column in data containing the weights for each event
620 @param label label for the plot legend
623 weight_column=weight_column)
625 tpr, tpr_error = hists.get_efficiency([
'Signal'])
626 fpr, fpr_error = hists.get_efficiency([
'Background'])
627 if isinstance(tpr, int)
and not isinstance(fpr, int):
628 tpr = numpy.array([tpr] * len(fpr))
629 elif isinstance(fpr, int)
and not isinstance(tpr, int):
630 fpr = numpy.array([fpr] * len(tpr))
631 elif isinstance(fpr, int)
and isinstance(tpr, int):
632 tpr = numpy.array([tpr])
633 fpr = numpy.array([fpr])
634 cuts = hists.bin_centers
636 valid = numpy.isfinite(tpr) & numpy.isfinite(fpr)
640 if not isinstance(tpr_error, int):
641 tpr_error = tpr_error[valid]
642 if not isinstance(fpr_error, int):
643 fpr_error = fpr_error[valid]
646 distance = numpy.sqrt(numpy.square(fpr) + numpy.square(1 - tpr))
647 if len(distance) == 0
or numpy.all(numpy.isnan(distance)):
650 best_idx = numpy.nanargmin(distance)
651 best_cut = cuts[best_idx]
652 best_tpr = tpr[best_idx]
653 best_fpr = fpr[best_idx]
656 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(fpr, self.
xmin)), numpy.nanmax(numpy.append(fpr, self.
xmax))
657 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(tpr, self.
ymin)), numpy.nanmax(numpy.append(tpr, self.
ymax))
659 auc = numpy.abs(numpy.trapz(tpr, fpr))
665 if best_idx
is not None:
667 self.
axis.
plot(best_fpr, best_tpr,
'x', color=p[1].get_color(), markersize=8)
668 self.
axis.axhline(best_tpr, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
669 self.
axis.axvline(best_fpr, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
672 cut_label = f
"{label[:10] if label else column[:10]} (AUC: {auc:.2f}, Cut: {best_cut:.3f})"
673 self.
labels.append(cut_label)
678 Sets limits, title, axis-labels and legend of the plot
681 self.
axis.set_title(
"True ROC Curve")
682 self.
axis.get_xaxis().set_label_text(
'False Positive Rate (Background Efficiency)')
683 self.
axis.get_yaxis().set_label_text(
'True Positive Rate (Signal Efficiency)')
684 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
690 Plots the Precision vs Recall curve and marks the cut that gives the point closest to the ideal (1,1).
697 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
699 Add a new curve to the Precision-Recall plot
700 @param data pandas.DataFrame containing all data
701 @param column which is used to calculate efficiency and purity for different cuts
702 @param signal_mask boolean numpy.array defining which events are signal events
703 @param bckgrd_mask boolean numpy.array defining which events are background events
704 @param weight_column column in data containing the weights for each event
705 @param label label for the plot legend
708 weight_column=weight_column)
710 recall, recall_error = hists.get_efficiency([
'Signal'])
711 precision, precision_error = hists.get_purity([
'Signal'], [
'Background'])
712 if isinstance(recall, int)
and not isinstance(precision, int):
713 recall = numpy.array([recall] * len(precision))
714 elif isinstance(precision, int)
and not isinstance(recall, int):
715 precision = numpy.array([precision] * len(recall))
716 elif isinstance(precision, int)
and isinstance(recall, int):
717 recall = numpy.array([recall])
718 precision = numpy.array([precision])
719 cuts = hists.bin_centers
721 valid = numpy.isfinite(precision) & numpy.isfinite(recall)
722 precision = precision[valid]
723 recall = recall[valid]
725 if not isinstance(recall_error, int):
726 recall_error = recall_error[valid]
727 if not isinstance(precision_error, int):
728 precision_error = precision_error[valid]
731 distance = numpy.sqrt(numpy.square(1 - precision) + numpy.square(1 - recall))
732 if len(distance) == 0
or numpy.all(numpy.isnan(distance)):
735 best_idx = numpy.nanargmin(distance)
736 best_cut = cuts[best_idx]
737 best_recall = recall[best_idx]
738 best_precision = precision[best_idx]
741 self.
xmin, self.
xmax = numpy.nanmin(numpy.append(recall, self.
xmin)), numpy.nanmax(numpy.append(recall, self.
xmax))
742 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(precision, self.
ymin)), numpy.nanmax(numpy.append(precision, self.
ymax))
744 auc = numpy.abs(numpy.trapz(precision, recall))
747 p = self.
_plot_datapoints(self.
axis, recall, precision, xerr=recall_error, yerr=precision_error)
750 if best_idx
is not None:
752 self.
axis.
plot(best_recall, best_precision,
'x', color=p[1].get_color(), markersize=8, label=
'Best cut')
753 self.
axis.axhline(best_precision, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
754 self.
axis.axvline(best_recall, color=p[1].get_color(), linestyle=
'dashed', linewidth=1)
757 cut_label = f
"{label[:10] if label else column[:10]} (AUC: {auc:.2f}, Cut: {best_cut:.3f})"
758 self.
labels.append(cut_label)
763 Sets limits, title, axis-labels and legend of the plot
766 self.
axis.set_title(
"Precision-Recall Curve")
767 self.
axis.get_xaxis().set_label_text(
'Recall (Signal Efficiency)')
768 self.
axis.get_yaxis().set_label_text(
'Precision (Purity)')
769 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
775 Plots multiple other plots into a grid 3x?
782 def __init__(self, cls, number_of_plots, figure=None, dpi=None):
784 Creates a new figure if None is given, sets the default plot parameters
785 @param cls class of the plot
786 @param number_of_plots number of plots which should be displayed
787 @param figure default draw figure which is used
788 @param dpi dpi for the matplotlib figure, if None default is used
790 if number_of_plots == 1:
792 elif number_of_plots == 2:
794 elif number_of_plots == 3:
796 elif number_of_plots == 4:
798 elif number_of_plots == 6:
801 gsTuple = (int(numpy.ceil(number_of_plots / 3)), 3)
807 self.
figure = matplotlib.figure.Figure(figsize=(12*gsTuple[1], 8*gsTuple[0]), dpi=dpi)
811 gs = matplotlib.gridspec.GridSpec(gsTuple[0], gsTuple[1])
813 grid_list = list(itertools.product(range(gs.nrows), range(gs.ncols)))
816 for i
in range(number_of_plots)]
821 def add(self, i, *args, **kwargs):
823 Call add function of ith subplot
824 @param i position of the subplot
830 Sets limits, title, axis-labels and legend of the plot
839 Plots the purity in each bin over the classifier output.
846 def add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None):
848 Add a new curve to the Diagonal plot
849 @param data pandas.DataFrame containing all data
850 @param column which is used to calculate purity for different cuts
851 @param signal_mask boolean numpy.array defining which events are signal events
852 @param bckgrd_mask boolean numpy.array defining which events are background events
853 @param weight_column column in data containing the weights for each event
854 @param label label for the plot legend
856 hists =
histogram.Histograms(data, column, {
'Signal': signal_mask,
'Background': bckgrd_mask}, weight_column=weight_column)
857 purity, purity_error = hists.get_purity_per_bin([
'Signal'], [
'Background'])
861 hists.bin_centers, self.
xmin)), numpy.nanmax(
863 hists.bin_centers, self.
xmax))
864 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(purity, self.
ymin)), numpy.nanmax(numpy.append(purity, self.
ymax))
867 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, purity, xerr=hists.bin_widths / 2.0, yerr=purity_error)
870 self.
labels.append(column)
877 Sets limits, title, axis-labels and legend of the plot
880 self.
axis.
plot((0.0, 1.0), (0.0, 1.0), color=
'black')
882 self.
axis.set_title(
"Diagonal Plot")
883 self.
axis.get_xaxis().set_label_text(
'Classifier Output')
884 self.
axis.get_yaxis().set_label_text(
'Purity Per Bin')
885 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
891 Plots distribution of a quantity
894 def __init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False,
895 keep_first_binning=False, range_in_std=None):
897 Creates a new figure and axis if None is given, sets the default plot parameters
898 @param figure default draw figure which is used
899 @param axis default draw axis which is used
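900 @param normed_to_all_entries true if histograms should be normalized to the total number of entries before drawing
@param normed_to_bin_width true if the bin contents should be divided by the bin width before drawing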
901 @param keep_first_binning use the binning of the first distribution for further plots
902 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
927 def add(self, data, column, mask=None, weight_column=None, label=None):
929 Add a new distribution to the plots
930 @param data pandas.DataFrame containing all data
931 @param column which is used to calculate distribution histogram
932 @param mask boolean numpy.array defining which events are used for the histogram
933 @param weight_column column in data containing the weights for each event
934 @param label label for the plot legend
937 mask = numpy.ones(len(data)).astype(
'bool')
943 bins=bins, equal_frequency=
False, range_in_std=self.
range_in_std)
946 hist, hist_error = hists.get_hist(
'Total')
949 normalization = float(numpy.sum(hist))
950 hist = hist / normalization
if normalization > 0
else hist
951 hist_error = hist_error / normalization
if normalization > 0
else hist_error
954 hist = hist / hists.bin_widths
if normalization > 0
else hist
955 hist_error = hist_error / hists.bin_widths
if normalization > 0
else hist_error
957 self.
xmin, self.
xmax = numpy.nanmin(
959 hists.bin_centers, self.
xmin)), numpy.nanmax(
961 hists.bin_centers, self.
xmax))
962 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(hist, self.
ymin)), numpy.nanmax(numpy.append(hist + hist_error, self.
ymax))
965 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, hist, xerr=hists.bin_widths / 2, yerr=hist_error)
971 appendix =
' No data to plot!'
974 self.
labels.append(column + appendix)
976 self.
labels.append(label + appendix)
981 Sets limits, title, axis-labels and legend of the plot
983 self.
axis.set_title(
"Distribution Plot")
986 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
989 self.
axis.set_xlim((0., 1.))
990 self.
axis.set_ylim((0., 1.))
991 self.
axis.text(0.36, 0.5,
'No data to plot', fontsize=60, color=
'black')
998 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / (# Entries * Bin Width)')
1000 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / # Entries')
1002 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin / Bin Width')
1004 self.
axis.get_yaxis().set_label_text(
'# Entries per Bin')
1016 def __init__(self, figure=None, axis=None, x_axis_label=None):
1018 Creates a new figure and axis if None is given, sets the default plot parameters
1019 @param figure default draw figure which is used
1020 @param axis default draw axis which is used
1022 super().
__init__(figure=figure, axis=axis)
1027 def add(self, data, column, mask=None, weight_column=None):
1029 Add a new boxplot to the plots
1030 @param data pandas.DataFrame containing all data
1031 @param column which is used to calculate boxplot quantities
1032 @param mask boolean numpy.array defining which events are used for the histogram
1033 @param weight_column column in data containing the weights for each event
1036 mask = numpy.ones(len(data)).astype(
'bool')
1037 x = data[column][mask]
1038 if weight_column
is not None:
1040 b2.B2WARNING(
"Weights are currently not used in boxplot, due to limitations in matplotlib")
1043 b2.B2WARNING(
"Ignore empty boxplot.")
1047 p = self.
axis.boxplot(x, sym=
'k.', whis=1.5, vert=
False, patch_artist=
True, showmeans=
True, widths=1,
1048 boxprops=dict(facecolor=
'blue', alpha=0.5), showfliers=
False,
1052 self.
plots.append(p)
1053 self.
labels.append(column)
1057 self.axis.text(0.1, 0.9, (r'$ \mu = {:.2f}$' + '\n' + r'$median = {:.2f}$').format(x.mean(), x.median()),
1058 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
1059 self.axis.text(0.4, 0.9, (r'$ \sigma = {:.2f}$' + '\n' + r'$IQD = {:.2f}$').format(x.std(),
1060 x.quantile(0.75) - x.quantile(0.25)),
1061 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
1062 self.axis.text(0.7, 0.9, (r'$min = {:.2f}$' + '\n' + r'$max = {:.2f}$').format(x.min(), x.max()),
1063 fontsize=28, verticalalignment='top', horizontalalignment='left', transform=self.axis.transAxes)
1070 Sets limits, title, axis-labels and legend of the plot
1072 matplotlib.artist.setp(self.
axis.get_yaxis(), visible=
False)
1074 self.
axis.set_title(
"Box Plot")
1080 Plots the difference between two histograms
1095 def __init__(self, figure=None, axis=None, normed=False, shift_to_zero=False):
1097 Creates a new figure and axis if None is given, sets the default plot parameters
1098 @param figure default draw figure which is used
1099 @param axis default draw axis which is used
1100 @param normed normalize minuend and subtrahend before comparing them
1101 @param shift_to_zero mean difference is shifted to zero, to remove constant offset due to e.g. different sample sizes
1113 def add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None):
1115 Add a new difference plot
1116 @param data pandas.DataFrame containing all data
1117 @param column which is used to calculate distribution histogram
1118 @param minuend_mask boolean numpy.array defining which events are for the minuend histogram
1119 @param subtrahend_mask boolean numpy.array defining which events are for the subtrahend histogram
1120 @param weight_column column in data containing the weights for each event
1121 @param label label for the legend if None, the column name is used
1123 hists =
histogram.Histograms(data, column, {
'Minuend': minuend_mask,
'Subtrahend': subtrahend_mask},
1124 weight_column=weight_column, equal_frequency=
False)
1125 minuend, minuend_error = hists.get_hist(
'Minuend')
1126 subtrahend, subtrahend_error = hists.get_hist(
'Subtrahend')
1130 difference_error = difference_error / (numpy.sum(minuend) + numpy.sum(subtrahend))
1131 minuend = minuend / numpy.sum(minuend)
1132 subtrahend = subtrahend / numpy.sum(subtrahend)
1133 difference = minuend - subtrahend
1136 difference = difference - numpy.mean(difference)
1140 hists.bin_centers, self.
xmin)), numpy.nanmax(
1142 hists.bin_centers, self.
xmax))
1143 self.
ymin, self.
ymax = numpy.nanmin(numpy.append(difference - difference_error, self.
ymin)
1144 ), numpy.nanmax(numpy.append(difference + difference_error, self.
ymax))
1147 p = self.
_plot_datapoints(self.
axis, hists.bin_centers, difference, xerr=hists.bin_widths / 2, yerr=difference_error)
1148 self.
plots.append(p)
1150 self.
labels.append(label)
1152 self.
labels.append(column)
1158 Sets limits, title, axis-labels and legend of the plot
1160 self.
axis.
plot((self.
xmin, self.
xmax), (0, 0), color=line_color, linewidth=4, rasterized=
True)
1163 self.
axis.set_title(
"Difference Plot")
1164 self.
axis.get_yaxis().set_major_locator(matplotlib.ticker.MaxNLocator(5))
1166 self.
axis.get_yaxis().set_label_text(
'Diff.')
1167 self.
axis.legend([x[0]
for x
in self.
plots], self.
labels, loc=
'best', fancybox=
True, framealpha=0.5)
1173 Create TMVA-like overtraining control plot for a classification training
1187 Creates a new figure if None is given, sets the default plot parameters
1188 @param figure default draw figure which is used
1189 @param dpi dpi for the matplotlib figure, if None default is used
1195 self.
figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=self.
dpi)
1199 gs = matplotlib.gridspec.GridSpec(5, 1)
1209 def add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None):
1211 Add a new overtraining plot. It is recommended to draw only one overtraining plot at a time,
1212 otherwise there are too many curves to recognize anything in the plot.
1213 @param data pandas.DataFrame containing all data
1214 @param column which is used to calculate distribution histogram
1215 @param train_mask boolean numpy.array defining which events are training events
1216 @param test_mask boolean numpy.array defining which events are test events
1217 @param signal_mask boolean numpy.array defining which events are signal events
1218 @param bckgrd_mask boolean numpy.array defining which events are background events
1219 @param weight_column column in data containing the weights for each event
1222 self.
axis.set_yscale(
'log')
1227 distribution.add(data, column, test_mask & signal_mask, weight_column)
1228 distribution.add(data, column, test_mask & bckgrd_mask, weight_column)
1230 distribution.set_plot_options(
1231 {
'color': distribution.plots[0][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
1232 distribution.set_fill_options({
'color': distribution.plots[0][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
1233 distribution.set_errorbar_options(
None)
1234 distribution.set_errorband_options(
None)
1235 distribution.add(data, column, train_mask & signal_mask, weight_column)
1236 distribution.set_plot_options(
1237 {
'color': distribution.plots[1][0][0].get_color(),
'linestyle':
'-',
'lw': 4,
'drawstyle':
'steps-mid'})
1238 distribution.set_fill_options({
'color': distribution.plots[1][0][0].get_color(),
'alpha': 0.5,
'step':
'post'})
1239 distribution.add(data, column, train_mask & bckgrd_mask, weight_column)
1241 distribution.labels = [
'Test-Signal',
'Test-Background',
'Train-Signal',
'Train-Background']
1242 distribution.finish()
1244 self.
plot_kwargs[
'color'] = distribution.plots[0][0][0].get_color()
1246 difference_signal.set_plot_options(self.
plot_kwargs)
1249 difference_signal.add(data, column, train_mask & signal_mask, test_mask & signal_mask, weight_column)
1250 self.
axis_d1.set_xlim((difference_signal.xmin, difference_signal.xmax))
1251 self.
axis_d1.set_ylim((difference_signal.ymin, difference_signal.ymax))
1252 difference_signal.plots = difference_signal.labels = []
1253 difference_signal.finish(line_color=distribution.plots[0][0][0].get_color())
1255 self.
plot_kwargs[
'color'] = distribution.plots[1][0][0].get_color()
1257 difference_bckgrd.set_plot_options(self.
plot_kwargs)
1260 difference_bckgrd.add(data, column, train_mask & bckgrd_mask, test_mask & bckgrd_mask, weight_column)
1261 self.
axis_d2.set_xlim((difference_bckgrd.xmin, difference_bckgrd.xmax))
1262 self.
axis_d2.set_ylim((difference_bckgrd.ymin, difference_bckgrd.ymax))
1263 difference_bckgrd.plots = difference_bckgrd.labels = []
1264 difference_bckgrd.finish(line_color=distribution.plots[1][0][0].get_color())
1269 if len(data[column][train_mask & signal_mask]) == 0
or len(data[column][test_mask & signal_mask]) == 0:
1270 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for signal due to missing data")
1272 ks = scipy.stats.ks_2samp(data[column][train_mask & signal_mask], data[column][test_mask & signal_mask])
1273 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
1274 self.
axis_d1.text(0.1, 0.9,
r'signal (train - test) difference $p={:.2f}$'.format(ks[1]), bbox=props,
1275 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d1.transAxes)
1276 if len(data[column][train_mask & bckgrd_mask]) == 0
or len(data[column][test_mask & bckgrd_mask]) == 0:
1277 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test for background due to missing data")
1279 ks = scipy.stats.ks_2samp(data[column][train_mask & bckgrd_mask], data[column][test_mask & bckgrd_mask])
1280 props = dict(boxstyle=
'round', edgecolor=
'gray', facecolor=
'white', linewidth=0.1, alpha=0.5)
1281 self.
axis_d2.text(0.1, 0.9,
r'background (train - test) difference $p={:.2f}$'.format(ks[1]),
1283 verticalalignment=
'top', horizontalalignment=
'left', transform=self.
axis_d2.transAxes)
1285 b2.B2WARNING(
"Cannot calculate kolmogorov smirnov test please install scipy!")
1291 Sets limits, title, axis-labels and legend of the plot
1293 self.
axis.set_title(
"Overtraining Plot")
1296 matplotlib.artist.setp(self.
axis.get_xticklabels(), visible=
False)
1297 matplotlib.artist.setp(self.
axis_d1.get_xticklabels(), visible=
False)
1298 self.
axis.get_xaxis().set_label_text(
'')
1299 self.
axis_d1.get_xaxis().set_label_text(
'')
1300 self.
axis_d2.get_xaxis().set_label_text(
'Classifier Output')
1306 Plots distribution of a quantity including boxplots
1312 def __init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None):
1314 Creates a new figure and axis if None is given, sets the default plot parameters
1315 @param figure default draw figure which is used
1316 @param axis default draw axis which is used
1317 @param normed true if the histograms should be normed before drawing
1318 @param range_in_std show only the data in a window of +- range_in_std * standard_deviation around the mean
1332 def add(self, data, column, mask=None, weight_column=None, label=None):
1334 Add a new distribution plot, with additional information like a boxplot compared to
1335 the ordinary Distribution plot.
1336 @param data pandas.DataFrame containing all data
1337 @param column which is used to calculate distribution histogram
1338 @param mask boolean numpy.array defining which events are used for the distribution histogram
1339 @param weight_column column in data containing the weights for each event
1340 @param label label for the plot legend
1348 gs = matplotlib.gridspec.GridSpec(4 * n, 1)
1349 gridspecs = [gs[:3 * n, :]] + [gs[3 * n + i, :]
for i
in range(n)]
1355 mask = mask & (data[column] > (mean - self.
range_in_std * std)) & (data[column] < (mean + self.
range_in_std * std))
1357 box.add(data, column, mask, weight_column)
1358 if len(box.plots) > 0:
1359 box.plots[0][
'boxes'][0].set_facecolor(self.
distribution.plots[-1][0][0].get_color())
1367 Sets limits, title, axis-labels and legend of the plot
1370 matplotlib.artist.setp(self.
axis.get_xticklabels(), visible=
False)
1371 self.
axis.get_xaxis().set_label_text(
'')
1372 for box_axis
in self.
box_axes[:-1]:
1373 matplotlib.artist.setp(box_axis.get_xticklabels(), visible=
False)
1374 box_axis.set_title(
"")
1375 box_axis.get_xaxis().set_label_text(
'')
1377 self.
axis.set_title(
"Distribution Plot")
1379 loc=
'best', fancybox=
True, framealpha=0.5)
1385 Plots change of a distribution of a quantity depending on the cut on a classifier
1398 Creates a new figure if None is given, sets the default plot parameters
1399 @param figure default draw figure which is used
1400 @param dpi dpi for the matplotlib figure, if None default is used
1406 self.
figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=self.
dpi)
1410 gs = matplotlib.gridspec.GridSpec(3, 2)
1420 def add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None):
1422 Add a new correlation plot.
1423 @param data pandas.DataFrame containing all data
1424 @param column which is used to calculate distribution histogram
1425 @param cut_column which is used to calculate cut on the other quantity defined by column
1426 @param quantiles list of quantiles between 0 and 100, defining the different cuts
1427 @param weight_column column in data containing the weights for each event
1429 if len(data[cut_column]) == 0:
1430 b2.B2WARNING(
"Ignore empty Correlation.")
1435 for i, (l, m)
in enumerate([(
'.', signal_mask | bckgrd_mask), (
'S', signal_mask), (
'B', bckgrd_mask)]):
1436 if weight_column
is not None:
1437 weights = numpy.array(data[weight_column][m])
1439 weights = numpy.ones(len(data[column][m]))
1441 xrange = numpy.percentile(data[column][m], [5, 95])
1442 isfinite = numpy.isfinite(data[column][m])
1443 if not numpy.all(isfinite):
1444 xrange = numpy.percentile(data[column][m][isfinite], [5, 95])
1445 elif numpy.all(numpy.isnan(data[column][m])):
1446 b2.B2WARNING(
"All data is NaN, cannot calculate range and ignore Correlation.")
1449 colormap = plt.get_cmap(
'coolwarm')
1450 tmp, x = numpy.histogram(data[column][m], bins=100,
1451 range=xrange, density=
True, weights=weights)
1452 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1453 axes[i].
plot(bin_center, tmp, color=
'black', lw=1)
1455 for quantil
in numpy.arange(5, 100, 5):
1456 cut = numpy.percentile(data[cut_column][m], quantil)
1457 sel = data[cut_column][m] >= cut
1458 y, x = numpy.histogram(data[column][m][sel], bins=100,
1459 range=xrange, density=
True, weights=weights[sel])
1460 bin_center = ((x + numpy.roll(x, 1)) / 2)[1:]
1461 axes[i].fill_between(bin_center, tmp, y, color=colormap(quantil / 100.0))
1464 axes[i].set_ylim(bottom=0)
1467 axes[i].set_title(
r'Distribution for different quantiles: $\mathrm{{Flatness}}_{} = {:.3f}$'.format(l, flatness_score))
1472 Sets limits, title, axis-labels and legend of the plot
1479 Plots multivariate distribution using TSNE algorithm
1482 def add(self, data, columns, *masks):
1484 Add a new TSNE plot.
1485 @param data pandas.DataFrame containing all data
1486 @param columns which are used as input features for the TSNE embedding
1487 @param masks different classes to show in TSNE
1491 import sklearn.manifold
1492 model = sklearn.manifold.TSNE(n_components=2, random_state=0)
1493 data = numpy.array([data[column]
for column
in columns]).T
1496 data = numpy.array([data[column][mask]
for column
in columns]).T
1497 data = model.transform(data)
1498 self.
axis.scatter(data[:, 0], data[:, 1], rasterized=
True)
1500 print(
"Cannot create TSNE plot. Install sklearn if you want it")
1505 Sets limits, title, axis-labels and legend of the plot
1512 Plots importance matrix
1515 def add(self, data, columns, variables):
1517 Add a new importance plot.
1518 @param data pandas.DataFrame containing all data
1519 @param columns which are shown as columns of the importance matrix, each scaled to the range 0 to 100
1523 width = (numpy.max(x) - numpy.min(x))
1525 return numpy.zeros(x.shape)
1526 return (x - numpy.min(x)) / width * 100
1528 importance_matrix = numpy.vstack([norm(data[column])
for column
in columns]).T
1529 im = self.
axis.imshow(
1530 importance_matrix[::-1],
1535 interpolation=
'nearest',
1539 num_y, num_x = importance_matrix.shape
1543 font_size = max(6, base_font_size * min(1.0, 25 / max(num_x, num_y)))
1546 self.
axis.set_xticks(numpy.arange(num_x))
1547 self.
axis.set_yticks(numpy.arange(num_y))
1549 self.
axis.set_xticklabels(columns, rotation=90, fontsize=font_size)
1550 self.
axis.set_yticklabels(reversed(variables), fontsize=font_size)
1552 self.
axis.tick_params(top=
True, bottom=
False, labeltop=
True, labelbottom=
False)
1555 for y
in range(num_y):
1556 for x
in range(num_x):
1557 value = importance_matrix[-1-y, x]
1558 txt = self.
axis.text(
1559 x, y, f
'{value:.0f}',
1560 ha=
'center', va=
'center',
1564 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'black')])
1567 cb = self.
figure.colorbar(im, ax=self.
axis, ticks=[0.0, 100.0], orientation=
'vertical')
1568 cb.ax.set_yticklabels([
'low',
'high'])
1569 cb.solids.set_rasterized(
True)
1572 self.
axis.set_xlim(-0.5, num_x - 0.5)
1573 self.
axis.set_ylim(num_y - 0.5, -0.5)
1579 Sets limits, title, axis-labels and legend of the plot
1586 Plots correlation matrix
1597 Creates a new figure if None is given, sets the default plot parameters
1598 @param figure default draw figure which is used
1599 @param dpi dpi for the matplotlib figure, if None default is used
1605 self.
figure = matplotlib.figure.Figure(figsize=(12, 8), dpi=self.
dpi)
1609 gs = matplotlib.gridspec.GridSpec(8, 2)
1621 def add(self, data, columns, signal_mask, bckgrd_mask):
1623 Add a new correlation plot.
1624 @param data pandas.DataFrame containing all data
1625 @param columns which are used to calculate the correlations
1627 num_vars = len(columns)
1628 font_size = max(4, min(14, 200 // num_vars))
1630 signal_corr = numpy.corrcoef(numpy.vstack([data[column][signal_mask]
for column
in columns])) * 100
1631 bckgrd_corr = numpy.corrcoef(numpy.vstack([data[column][bckgrd_mask]
for column
in columns])) * 100
1634 signal_corr[::-1, ::-1],
1640 interpolation=
'nearest')
1642 bckgrd_corr[::-1, ::-1],
1648 interpolation=
'nearest')
1651 tick_positions = numpy.arange(num_vars)
1657 self.
signal_axis.set_xticklabels(reversed(columns), rotation=90, fontsize=font_size)
1658 self.
signal_axis.set_yticklabels(reversed(columns), fontsize=font_size)
1666 self.
bckgrd_axis.set_xticklabels(reversed(columns), rotation=90, fontsize=font_size)
1667 self.
bckgrd_axis.set_yticklabels(reversed(columns), fontsize=font_size)
1672 for y
in range(num_vars):
1673 for x
in range(num_vars):
1674 txt = self.
signal_axis.text(x, y, f
'{signal_corr[-1-y, -1-x]:.0f}',
1675 ha=
'center', va=
'center',
1678 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1679 txt = self.
bckgrd_axis.text(x, y, f
'{bckgrd_corr[-1-y, -1-x]:.0f}',
1680 ha=
'center', va=
'center',
1683 txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground=
'k')])
1687 ticks=[-100, 0, 100], orientation=
'horizontal')
1688 cb.solids.set_rasterized(
True)
1689 cb.ax.set_xticklabels([
'negative',
'uncorrelated',
'positive'])
1695 Sets limits, title, axis-labels and legend of the plot
1697 matplotlib.artist.setp(self.
bckgrd_axis.get_yticklabels(), visible=
False)
1701if __name__ ==
'__main__':
1703 def get_data(N, columns):
1705 Creates fake data for example plots
1708 n = len(columns) - 1
1709 xs = numpy.random.normal(0, size=(N, n))
1710 xb = numpy.random.normal(1, size=(N, n))
1713 data = pandas.DataFrame(numpy.c_[numpy.r_[xs, xb], numpy.r_[ys, yb]], columns=columns)
1714 return data.reindex(numpy.random.permutation(data.index))
1718 seaborn.set(font_scale=3)
1719 seaborn.set_style(
'whitegrid')
1723 data = get_data(N, columns=[
'FastBDT',
'NeuroBayes',
'isSignal'])
1725 data.type.iloc[:N / 2] =
'Train'
1726 data.type.iloc[N / 2:] =
'Test'
1729 p.add(data,
'FastBDT')
1731 p.save(
'box_plot.png')
1734 p.add(data,
'FastBDT')
1735 p.add(data,
'NeuroBayes')
1737 p.save(
'verbose_distribution_plot.png')
1740 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1741 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1743 p.save(
'roc_purity_plot.png')
1746 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1747 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1749 p.save(
'roc_rejection_plot.png')
1752 p.add(data,
'FastBDT', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1753 p.add(data,
'NeuroBayes', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1755 p.save(
'diagonal_plot.png')
1758 p.add(data,
'FastBDT')
1759 p.add(data,
'NeuroBayes')
1761 p.save(
'distribution_plot.png')
1764 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1765 p.add(data,
'NeuroBayes', data[
'type'] ==
'Train', data[
'type'] ==
'Test')
1767 p.save(
'difference_plot.png')
1770 p.add(data,
'FastBDT', data[
'type'] ==
'Train', data[
'type'] ==
'Test', data[
'isSignal'] == 1, data[
'isSignal'] == 0)
1772 p.save(
'overtraining_plot.png')
1775 p.add(data,
'FastBDT',
'NeuroBayes', [0, 20, 40, 60, 80, 100], data[
'isSignal'] == 0)
1777 p.save(
'correlation_plot.png')
1780 data[
'FastBDT2'] = data[
'FastBDT']**2
1781 data[
'NeuroBayes2'] = data[
'NeuroBayes']**2
1782 data[
'FastBDT3'] = data[
'FastBDT']**3
1783 data[
'NeuroBayes3'] = data[
'NeuroBayes']**3
1784 p.add(data, [
'FastBDT',
'NeuroBayes',
'FastBDT2',
'NeuroBayes2',
'FastBDT3',
'NeuroBayes3'])
1786 p.save(
'correlation_matrix.png')
calculate_flatness(f, p, w=None)
__init__(self, figure=None, axis=None, x_axis_label=None)
x_axis_label
Label on x axis.
add(self, data, column, mask=None, weight_column=None)
signal_axis
Main axis which shows the correlation of the signal samples.
colorbar_axis
Axis which shows the colorbar.
add(self, data, columns, signal_mask, bckgrd_mask)
__init__(self, figure=None, dpi=None)
bckgrd_axis
Axis which shows the correlation of the background samples.
axis_d1
Axis which shows shape of signal.
__init__(self, figure=None, dpi=None)
axis_d2
Axis which shows shape of background.
add(self, data, column, cut_column, quantiles, signal_mask=None, bckgrd_mask=None, weight_column=None)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
x_axis_label
Label on x axis.
shift_to_zero
Mean difference is shifted to zero (removes constant offset) if this is true.
__init__(self, figure=None, axis=None, normed=False, shift_to_zero=False)
add(self, data, column, minuend_mask, subtrahend_mask, weight_column=None, label=None)
normed
Minuend and subtrahend are normed before comparing them if this is true.
finish(self, line_color='black')
str x_axis_label
x axis label
keep_first_binning
Keep first binning if user wants so.
normed_to_all_entries
Normalize histograms before drawing them.
first_binning
first binning
__init__(self, figure=None, axis=None, normed_to_all_entries=False, normed_to_bin_width=False, keep_first_binning=False, range_in_std=None)
range_in_std
Show only a certain range in terms of standard deviations of the data.
add(self, data, column, mask=None, weight_column=None, label=None)
normed_to_bin_width
Normalize histograms before drawing them.
add(self, data, columns, variables)
list sub_plots
the subplots which are displayed in the grid
__init__(self, cls, number_of_plots, figure=None, dpi=None)
add(self, i, *args, **kwargs)
axis_d1
Axis which shows the difference between training and test signal.
__init__(self, figure=None, dpi=None)
axis_d2
Axis which shows the difference between training and test background.
add(self, data, column, train_mask, test_mask, signal_mask, bckgrd_mask, weight_column=None)
list plots
Plots added to the axis so far.
list labels
Labels of the plots added so far.
fill_kwargs
Default keyword arguments for fill_between function.
float yscale
create figure
add_subplot(self, gridspecs)
figure
figure which is used to draw
add(self, *args, **kwargs)
errorband_kwargs
Default keyword arguments for errorband function.
set_fill_options(self, fill_kwargs=None)
finish(self, *args, **kwargs)
set_plot_options(self, plot_kwargs={ 'linestyle':''})
prop_cycler
Property cycler used to give plots unique colors.
set_errorbar_options(self, errorbar_kwargs={ 'fmt':'.', 'elinewidth':3, 'alpha':1})
Overrides default errorbar options for datapoint errorbars.
errorbar_kwargs
Default keyword arguments for errorbar function.
float xscale
create figure
__init__(self, figure=None, axis=None, dpi=None)
_plot_datapoints(self, axis, x, y, xerr=None, yerr=None)
axis
Main axis which is used to draw.
setAxisLimits(self, factor=0.0)
set_errorband_options(self, errorband_kwargs={ 'alpha':0.5})
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, normed=True)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
add(self, data, columns, *masks)
add(self, data, column, signal_mask, bckgrd_mask, weight_column=None, label=None)
distribution
Distribution plot whose colors are reused for the boxplots.
list box_axes
Axes for the boxplots.
range_in_std
Show only a certain range in terms of standard deviations of the data.
add(self, data, column, mask=None, weight_column=None, label=None)
__init__(self, figure=None, axis=None, normed=False, range_in_std=None, x_axis_label=None)
normed
Normalize histograms before drawing them.
weighted_mean_and_std(x, w)