31 """Get for the logging.Logger instance of this module
36 Logger instance of this module
38 return logging.getLogger(__name__)
42units_by_quantity_name = {
67def get_unit(quantity_name):
68 """Infers the unit of a quantity from its name.
70 Assumes the standard Belle II unit system.
72 Currently looks up the quantity string from units_by_quantity_name.
77 Name of a quantity (E.g. pt, x, ...)
84 unit = units_by_quantity_name.get(quantity_name,
None)
88def compose_axis_label(quantity_name, unit=None):
89 """Formats a quantity name and a unit to a label for a plot axes.
91 If the unit is not given to is tried to infer it
92 from the quantity name by the get_unit function.
97 Name of the quantity to be displayed at the axes
99 The unit of the quantity. Defaults to get_unit(quantity_name)
107 unit = get_unit(quantity_name)
110 axis_label = quantity_name
112 axis_label = f
'{quantity_name} ({unit})'
117def find_object_recursive(tFileOrTDirectory, name):
119 Recursively search for an object in a ROOT file by name.
121 - First searches the top-level directory.
122 - Then searches subdirectories recursively.
123 - Includes fallback using gDirectory in case ROOT key table is incomplete.
125 @param tFileOrTDirectory (ROOT.TDirectory or ROOT.TFile): The file or directory to search in.
126 @param name (str): The name of the object to find.
128 @return ROOT object or None if not found.
132 obj = tFileOrTDirectory.Get(name)
142 for key
in tFileOrTDirectory.GetListOfKeys():
143 if key.GetClassName() ==
'TDirectoryFile':
144 subdir = tFileOrTDirectory.Get(key.GetName())
145 obj = find_object_recursive(subdir, name)
152def get1DBinningFromReference(name, refFileName):
153 """ returns nbins, lowerbound, upperbound for TH1 / TProfile with name "name" found in the file "refFileName"
155 @param name : name of the TH1 object to be looked for in the file
156 @param refFileName : name of the reference file where the object is searched for
158 @return int nbin, float xmin, float xmax of the TH1
165 if refFileName
is None or refFileName ==
"":
166 return nbins, x_min, x_max
169 oldDirectory = ROOT.gROOT.CurrentDirectory().load()
171 tfile = ROOT.TFile(refFileName)
173 objptr = find_object_recursive(tfile, name)
174 if objptr
and objptr.InheritsFrom(
"TH1"):
175 nbins = objptr.GetNbinsX()
176 x_min = objptr.GetXaxis().GetXmin()
177 x_max = objptr.GetXaxis().GetXmax()
179 basf2.B2WARNING(
'Requested object with name: ' + name +
' not found in file: ' + refFileName +
" (or not a TH1)")
181 basf2.B2WARNING(
'Requested file: ' + refFileName +
' could not be opened')
188 return nbins, x_min, x_max
192StatsEntry = ROOT.TParameter(float)
197 """Class for generating a validation plot for the Belle II validation page.
199 Typically it generates plots from values stored in numpy arrays and feeds them into
200 plot ROOT classes for storing them.
202 It implements an automatic binning procedure based on the rice rule and
203 robust z score outlier detection.
205 It also keeps track of additional statistics typically neglected by ROOT such as a count
206 for the non finit values such as NaN, +Inf, -Inf.
208 The special attributes for the Belle II validation page like
213 are exposed as properties of this class.
217 very_sparse_dots_line_style_index = 28
220 def __init__(self, name, referenceFileName=None):
221 """Constructor of the ValidationPlot
226 A unique name to be used as the name of the ROOT object to be generated
228 referenceFileName : str
229 name of a reference file. If set the code will try to get the histogram or profile
230 from that file and determine the number of bins and upper and lower bound
231 (so far only implemented for 1D (TH1, TProfile), is ignored for 2D plots)
235 self.name = root_save_name(name)
238 self.referenceFileName = referenceFileName
241 self._description =
''
265 self._is_expert =
True
268 self.pvalue_warn =
None
271 self.pvalue_error =
None
283 outlier_z_score=None,
284 include_exceptionals=True,
285 allow_discrete=False,
286 cumulation_direction=None,
288 """Fill the plot with a one dimensional histogram."""
291 if self.referenceFileName
is not None:
292 n, xmin, xmax = get1DBinningFromReference(self.name, self.referenceFileName)
293 if n
is not None and xmin
is not None and xmax
is not None:
298 th1_factory = ROOT.TH1D
299 self._is_expert = is_expert
301 self.create_1d(th1_factory,
306 lower_bound=lower_bound,
307 upper_bound=upper_bound,
308 outlier_z_score=outlier_z_score,
309 include_exceptionals=include_exceptionals,
310 allow_discrete=allow_discrete,
311 cumulation_direction=cumulation_direction)
315 self.ylabel =
'count'
329 outlier_z_score=None,
330 include_exceptionals=True,
331 allow_discrete=False,
332 cumulation_direction=None,
336 """Fill the plot with a one dimensional profile of one variable over another."""
339 if self.referenceFileName
is not None:
343 n, xmin, xmax = get1DBinningFromReference(self.name, self.referenceFileName)
344 if n
is not None and xmin
is not None and xmax
is not None:
349 th1_factory = ROOT.TProfile
350 self._is_expert = is_expert
351 if gaus_z_score
is None:
352 self.create_1d(th1_factory,
358 lower_bound=lower_bound,
359 upper_bound=upper_bound,
360 outlier_z_score=outlier_z_score,
361 include_exceptionals=include_exceptionals,
362 allow_discrete=allow_discrete,
363 cumulation_direction=cumulation_direction)
366 self.name =
"_" + self.name
367 self.hist2d(xs, ys=ys, weights=weights, stackby=stackby,
369 lower_bound=(lower_bound,
None),
370 upper_bound=(upper_bound,
None),
371 outlier_z_score=(outlier_z_score, outlier_z_score),
372 include_exceptionals=(include_exceptionals,
True),
373 allow_discrete=(allow_discrete,
False),
376 self.name = self.name[1:]
378 for histogram
in self.histograms:
379 profile = self.gaus_slice_fit(histogram,
380 name=histogram.GetName()[1:],
381 z_score=gaus_z_score)
382 profiles.append(profile)
383 self.histograms = profiles
384 self.plot = self.create_stack(profiles, name=self.plot.GetName()[1:], reverse_stack=
False)
389 if y_binary
or self.is_binary(ys):
391 self.ylabel =
'probability'
396 for histogram
in self.histograms:
397 histogram.SetMinimum(min_y)
398 histogram.SetMaximum(1.05)
400 self.plot.SetMinimum(min_y)
401 self.plot.SetMaximum(1.05)
409 lower_bound=(
None,
None),
410 upper_bound=(
None,
None),
411 outlier_z_score=(
None,
None),
412 include_exceptionals=(
True,
True),
415 """Fill the plot with a (unbinned) two dimensional scatter plot"""
417 self._is_expert = is_expert
419 x_lower_bound, y_lower_bound = self.unpack_2d_param(lower_bound)
420 x_upper_bound, y_upper_bound = self.unpack_2d_param(upper_bound)
421 x_outlier_z_score, y_outlier_z_score = self.unpack_2d_param(outlier_z_score)
422 x_include_exceptionals, y_include_exceptionals = self.unpack_2d_param(include_exceptionals)
424 x_lower_bound, x_upper_bound = self.determine_range(
426 lower_bound=x_lower_bound,
427 upper_bound=x_upper_bound,
428 outlier_z_score=x_outlier_z_score,
429 include_exceptionals=x_include_exceptionals
432 y_lower_bound, y_upper_bound = self.determine_range(
434 lower_bound=y_lower_bound,
435 upper_bound=y_upper_bound,
436 outlier_z_score=y_outlier_z_score,
437 include_exceptionals=y_include_exceptionals
440 graph = ROOT.TGraph()
442 graph.SetName(self.name)
443 graph.SetMarkerStyle(6)
444 graph.GetHistogram().SetOption(
"AP")
449 graph.SetLineColorAlpha(color_index, 0)
450 graph.SetLineStyle(self.very_sparse_dots_line_style_index)
453 graph.GetXaxis().SetLimits(x_lower_bound, x_upper_bound)
454 graph.GetYaxis().SetLimits(y_lower_bound, y_upper_bound)
464 def grapherrors(self,
468 lower_bound=(
None,
None),
469 upper_bound=(
None,
None),
470 outlier_z_score=(
None,
None),
471 include_exceptionals=(
True,
True),
474 """Fill the plot with a (unbinned) two dimensional scatter plot
475 xs_and_err and ys_and_err are tuples containing the values and the errors on these values
479 self._is_expert = is_expert
484 x_lower_bound, y_lower_bound = self.unpack_2d_param(lower_bound)
485 x_upper_bound, y_upper_bound = self.unpack_2d_param(upper_bound)
486 x_outlier_z_score, y_outlier_z_score = self.unpack_2d_param(outlier_z_score)
487 x_include_exceptionals, y_include_exceptionals = self.unpack_2d_param(include_exceptionals)
489 x_lower_bound, x_upper_bound = self.determine_range(
491 lower_bound=x_lower_bound,
492 upper_bound=x_upper_bound,
493 outlier_z_score=x_outlier_z_score,
494 include_exceptionals=x_include_exceptionals
497 y_lower_bound, y_upper_bound = self.determine_range(
499 lower_bound=y_lower_bound,
500 upper_bound=y_upper_bound,
501 outlier_z_score=y_outlier_z_score,
502 include_exceptionals=y_include_exceptionals
505 graph = ROOT.TGraphErrors()
507 graph.SetName(self.name)
508 graph.GetHistogram().SetOption(
"A")
510 graph.SetMarkerColor(4)
511 graph.SetMarkerStyle(21)
514 graph.GetXaxis().SetLimits(x_lower_bound, x_upper_bound)
515 graph.GetYaxis().SetLimits(y_lower_bound, y_upper_bound)
531 lower_bound=(
None,
None),
532 upper_bound=(
None,
None),
533 outlier_z_score=(
None,
None),
534 include_exceptionals=(
True,
True),
535 allow_discrete=(
False,
False),
538 """Fill the plot with a two dimensional histogram"""
542 if quantiles
is not None:
543 name =
"_" + self.name
545 x_bins, y_bins = self.unpack_2d_param(bins)
546 x_lower_bound, y_lower_bound = self.unpack_2d_param(lower_bound)
547 x_upper_bound, y_upper_bound = self.unpack_2d_param(upper_bound)
548 x_outlier_z_score, y_outlier_z_score = self.unpack_2d_param(outlier_z_score)
549 x_include_exceptionals, y_include_exceptionals = self.unpack_2d_param(include_exceptionals)
550 x_allow_discrete, y_allow_discrete = self.unpack_2d_param(allow_discrete)
552 if quantiles
is not None:
553 y_include_exceptionals =
True
554 y_allow_discrete =
False
556 x_bin_edges, x_bin_labels = self.determine_bin_edges(xs,
559 lower_bound=x_lower_bound,
560 upper_bound=x_upper_bound,
561 outlier_z_score=x_outlier_z_score,
562 include_exceptionals=x_include_exceptionals,
563 allow_discrete=x_allow_discrete)
565 y_bin_edges, y_bin_labels = self.determine_bin_edges(ys,
568 lower_bound=y_lower_bound,
569 upper_bound=y_upper_bound,
570 outlier_z_score=y_outlier_z_score,
571 include_exceptionals=y_include_exceptionals,
572 allow_discrete=y_allow_discrete)
574 n_x_bins = len(x_bin_edges) - 1
575 n_y_bins = len(y_bin_edges) - 1
577 self.lower_bound = (x_bin_edges[0], y_bin_edges[0])
579 self.upper_bound = (x_bin_edges[-1], y_bin_edges[-1])
581 histogram = ROOT.TH2D(name,
589 get_logger().info(
"Scatter plot %s is discrete in x.", name)
590 x_taxis = histogram.GetXaxis()
591 for i_x_bin, x_bin_label
in enumerate(x_bin_labels):
592 x_taxis.SetBinLabel(i_x_bin + 1, x_bin_label)
593 self.add_stats_entry(histogram,
"dx", 1)
596 x_bin_width = x_bin_edges[1] - x_bin_edges[0]
597 self.add_stats_entry(histogram,
"dx", x_bin_width)
600 get_logger().info(
"Scatter plot %s is discrete in y.", name)
601 y_taxis = histogram.GetYaxis()
602 for i_y_bin, y_bin_label
in enumerate(y_bin_labels):
603 y_taxis.SetBinLabel(i_y_bin + 1, y_bin_label)
604 self.add_stats_entry(histogram,
"dy", 1)
607 y_bin_width = y_bin_edges[1] - y_bin_edges[0]
608 self.add_stats_entry(histogram,
"dy", y_bin_width)
610 self.create(histogram, xs, ys=ys, weights=weights, stackby=stackby)
612 if quantiles
is not None:
613 self.name = self.name[1:]
615 for histogram
in self.histograms:
616 for quantile
in quantiles:
617 profile = histogram.QuantilesX(quantile, histogram.GetName()[1:] +
'_' + str(quantile))
620 x_taxis = histogram.GetXaxis()
621 new_x_taxis = profile.GetXaxis()
622 for i_bin
in range(x_taxis.GetNbins() + 2):
623 label = x_taxis.GetBinLabel(i_bin)
625 new_x_taxis.SetBinLabel(i_bin, label)
628 epsilon = sys.float_info.epsilon
629 for i_bin
in range(0, profile.GetNbinsX() + 2):
630 profile.SetBinError(i_bin, epsilon)
632 profiles.append(profile)
634 self.histograms = profiles
635 self.plot = self.create_stack(profiles, name=self.plot.GetName()[1:], reverse_stack=
False, force_graph=
True)
639 for histogram
in self.histograms:
640 x_taxis = histogram.GetXaxis()
641 x_bin_edges = array.array(
"d", list(range(len(x_bin_labels) + 1)))
642 x_taxis.Set(n_x_bins, x_bin_edges)
646 for histogram
in self.histograms:
647 x_taxis = histogram.GetXaxis()
648 y_bin_edges = array.array(
"d", list(range(len(y_bin_labels) + 1)))
649 y_taxis.Set(n_y_bins, y_bin_edges)
653 def fit_gaus(self, z_score=None):
654 """Fit a Gaus bell curve to the central portion of a one dimensional histogram
656 The fit is applied to the central mean +- z_score * std interval of the histogram,
657 such that it is less influence by non gaussian tails further away than the given z score.
659 @param float z_score number of sigmas to include from the mean value of the histogram.
666 raise RuntimeError(
'Validation plot must be filled before it can be fitted.')
668 if not isinstance(plot, ROOT.TH1D):
669 raise RuntimeError(
'Fitting is currently implemented / tested for one dimensional, non stacked validation plots.')
673 fit_tf1 = ROOT.TF1(
"Fit", formula)
674 fit_tf1.SetTitle(title)
675 fit_tf1.SetParName(0,
"n")
676 fit_tf1.SetParName(1,
"mean")
677 fit_tf1.SetParName(2,
"std")
679 n = histogram.GetSumOfWeights()
680 mean = histogram.GetMean()
681 std = histogram.GetStdDev()
683 fit_tf1.SetParameter(0, n)
684 fit_tf1.SetParameter(1, mean)
685 fit_tf1.SetParameter(2, std)
688 return self.fit(fit_tf1,
693 """Fit a general line to a one dimensional histogram"""
696 fit_tf1 = ROOT.TF1(
"Fit", formula)
697 fit_tf1.SetTitle(title)
698 fit_tf1.SetParName(0,
"slope")
699 fit_tf1.SetParName(1,
"intercept")
700 self.fit(fit_tf1,
'M')
703 """Fit a constant function to a one dimensional histogram"""
706 fit_tf1 = ROOT.TF1(
"Fit", formula)
707 fit_tf1.SetTitle(title)
708 fit_tf1.SetParName(0,
"intercept")
709 self.fit(fit_tf1,
'M')
712 """Fit a diagonal line through the origin to a one dimensional histogram"""
715 fit_tf1 = ROOT.TF1(
"Fit", formula)
716 fit_tf1.SetTitle(title)
717 fit_tf1.SetParName(0,
"slope")
718 self.fit(fit_tf1,
'M')
720 def fit(self, formula, options, lower_bound=None, upper_bound=None, z_score=None):
721 """Fit a user defined function to a one dimensional histogram
726 Formula string or TH1 to be fitted. See TF1 constructors for that is a valid formula
728 Options string to be used in the fit. See TH1::Fit()
730 Lower bound of the range to be fitted
732 Upper bound of the range to be fitted
736 raise RuntimeError(
'Validation plot must be filled before it can be fitted.')
738 if not isinstance(plot, ROOT.TH1D):
739 raise RuntimeError(
'Fitting is currently implemented / tested for one dimensional, non stacked validation plots.')
743 xaxis = histogram.GetXaxis()
744 n_bins = xaxis.GetNbins()
745 hist_lower_bound = xaxis.GetBinLowEdge(1)
746 hist_upper_bound = xaxis.GetBinUpEdge(n_bins)
748 if z_score
is not None:
749 mean = histogram.GetMean()
750 std = histogram.GetStdDev()
752 if lower_bound
is None:
753 lower_bound = mean - z_score * std
755 if upper_bound
is None:
756 upper_bound = mean + z_score * std
759 if isinstance(formula, ROOT.TF1):
761 fit_tf1.SetRange(hist_lower_bound, hist_upper_bound)
763 fit_tf1 = ROOT.TF1(
"Fit",
767 get_logger().info(
'Fitting with %s', fit_tf1.GetExpFormula())
770 if lower_bound
is None or lower_bound < hist_lower_bound:
771 lower_bound = hist_lower_bound
772 if upper_bound
is None or upper_bound > hist_upper_bound:
773 upper_bound = hist_upper_bound
777 if 'N' not in options:
780 fit_res = histogram.Fit(fit_tf1, options +
"S",
"", lower_bound, upper_bound)
782 self.set_fit_tf1(histogram, fit_tf1)
790 raise ValueError(
"Can not show a validation plot that has not been filled.")
792 def write(self, tdirectory=None):
793 """Write the plot to file
797 tdirectory : ROOT.TDirectory, optional
798 ROOT directory to which the plot should be written.
799 If omitted write to the current directory
802 raise ValueError(
"Can not write a validation plot that has not been filled.")
804 with root_cd(tdirectory):
805 ValidationPlot.set_tstyle()
806 if self.plot
not in self.histograms:
810 meta_options = [
"nostats"]
814 meta_options.append(
"expert")
816 meta_options.append(
"shifter")
819 if self.pvalue_error
is not None:
820 meta_options.append(f
"pvalue-error={self.pvalue_error}")
821 if self.pvalue_warn
is not None:
822 meta_options.append(f
"pvalue-warn={self.pvalue_warn}")
826 meta_options.append(
"logy")
828 meta_options_str =
",".join(meta_options)
830 for histogram
in self.histograms:
831 histogram.GetListOfFunctions().Add(ROOT.TNamed(
'MetaOptions', meta_options_str))
836 """Getter method if an plot plot is marked as expert plot"""
837 return self._is_expert
841 """Getter for the plot title"""
845 def title(self, title):
846 """Setter for the plot title"""
849 self.plot.SetTitle(title)
850 for histogram
in self.histograms:
851 histogram.SetTitle(title)
855 """Getter for the axis label at the x axis"""
859 def xlabel(self, xlabel):
860 """Setter for the axis label at the x axis"""
861 self._xlabel = xlabel
862 for histogram
in self.histograms:
863 histogram.GetXaxis().SetTitle(xlabel)
867 """Getter for the axis label at the y axis"""
871 def ylabel(self, ylabel):
872 """Setter for the axis label at the y axis"""
873 self._ylabel = ylabel
874 for histogram
in self.histograms:
875 histogram.GetYaxis().SetTitle(ylabel)
879 """Getter for the contact email address to be displayed on the validation page"""
883 def contact(self, contact):
884 """Setter for the contact email address to be displayed on the validation page"""
885 self._contact = contact
886 for histogram
in self.histograms:
887 found_obj = histogram.FindObject(
'Contact')
889 tnamed = ROOT.TNamed(
"Contact", contact)
890 histogram.GetListOfFunctions().Add(tnamed)
891 found_obj = histogram.FindObject(
'Contact')
892 found_obj.SetTitle(contact)
895 def description(self):
896 """Getter for the description to be displayed on the validation page"""
897 return self._description
900 def description(self, description):
901 """Setter for the description to be displayed on the validation page"""
902 self._description = description
903 for histogram
in self.histograms:
904 found_obj = histogram.FindObject(
'Description')
906 tnamed = ROOT.TNamed(
"Description", description)
907 histogram.GetListOfFunctions().Add(tnamed)
908 found_obj = histogram.FindObject(
'Description')
909 found_obj.SetTitle(description)
913 """Getter for the check to be displayed on the validation page"""
917 def check(self, check):
918 """Setter for the check to be displayed on the validation page"""
920 for histogram
in self.histograms:
921 found_obj = histogram.FindObject(
'Check')
923 tnamed = ROOT.TNamed(
"Check", check)
924 histogram.GetListOfFunctions().Add(tnamed)
925 found_obj = histogram.FindObject(
'Check')
926 found_obj.SetTitle(check)
932 def unpack_2d_param(param):
933 """Unpacks a function parameter for the two dimensional plots.
935 If it is a pair the first parameter shall apply to the x coordinate
936 the second to the y coordinate. In this case the pair is returned as two values
938 If something else is given the it is assumed that this parameter should equally apply
939 to both coordinates. In this case the same values is return twice as a pair.
943 param : pair or single value
944 Function parameter for a two dimensional plot
949 A pair of values being the parameter for the x coordinate and
950 the y coordinate respectively
954 x_param, y_param = param
958 return x_param, y_param
962 """Determine if the data consists of boolean values"""
963 return statistics.is_binary_series(xs)
966 def is_discrete(xs, max_n_unique=None):
967 """Determine if the data consists of discrete values"""
968 return statistics.is_discrete_series(xs, max_n_unique=max_n_unique)
971 def get_exceptional_values(xs):
972 """Find exceptionally frequent values
982 A list of the found exceptional values.
984 return statistics.rice_exceptional_values(xs)
987 def get_robust_mean_and_std(xs):
988 """Does an estimation of mean and standard deviation robust against outliers.
998 Pair of mean and standard deviation
1000 x_mean = statistics.truncated_mean(xs)
1001 x_std = statistics.trimmed_std(xs)
1002 return x_mean, x_std
1005 def format_bin_label(value):
1006 """Formats a value to be placed at a tick on an axis."""
1007 if np.isfinite(value)
and value == np.round(value):
1008 return str(int(value))
1010 formated_value = f
"{value:.5g}"
1013 if len(formated_value) > 8:
1014 formated_value = f
"{value:.3e}"
1015 return formated_value
1026 outlier_z_score=None,
1027 include_exceptionals=True,
1028 allow_discrete=False,
1029 cumulation_direction=None):
1030 """Combined factory method for creating a one dimensional histogram or a profile plot."""
1034 xs = np.array(xs, copy=
False)
1037 ys = np.array(ys, copy=
False)
1039 if weights
is not None:
1040 weights = np.array(weights, copy=
False)
1042 bin_edges, bin_labels = self.determine_bin_edges(xs,
1045 lower_bound=lower_bound,
1046 upper_bound=upper_bound,
1047 outlier_z_score=outlier_z_score,
1048 include_exceptionals=include_exceptionals,
1049 allow_discrete=allow_discrete)
1051 n_bins = len(bin_edges) - 1
1052 self.lower_bound = bin_edges[0]
1053 self.upper_bound = bin_edges[-1]
1054 histogram = th1_factory(name,
'', n_bins, bin_edges)
1057 get_logger().info(
"One dimensional plot %s is discrete in x.", name)
1058 x_taxis = histogram.GetXaxis()
1059 for i_bin, bin_label
in enumerate(bin_labels):
1060 x_taxis.SetBinLabel(i_bin + 1, bin_label)
1061 self.add_stats_entry(histogram,
"dx", 1)
1064 bin_width = bin_edges[1] - bin_edges[0]
1065 self.add_stats_entry(histogram,
"dx", bin_width)
1067 self.create(histogram,
1072 cumulation_direction=cumulation_direction,
1078 for histogram
in self.histograms:
1079 x_taxis = histogram.GetXaxis()
1080 bin_edges = array.array(
"d", list(range(len(bin_labels) + 1)))
1081 x_taxis.Set(n_bins, bin_edges)
1089 cumulation_direction=None,
1090 reverse_stack=None):
1091 """Create histograms from a template, possibly stacked"""
1096 histogram = histogram_template
1097 self.fill_into(histogram, xs, ys, weights=weights)
1098 if cumulation_direction
is not None:
1099 histogram = self.cumulate(histogram, cumulation_direction=cumulation_direction)
1101 histograms.append(histogram)
1105 stackby = np.array(stackby, copy=
False)
1106 name = histogram_template.GetName()
1108 histograms = self.fill_into_grouped(histogram_template,
1113 groupby_label=
"stack")
1115 if cumulation_direction
is not None:
1116 histograms = [self.cumulate(histogram, cumulation_direction=cumulation_direction)
1117 for histogram
in histograms]
1119 plot = self.create_stack(histograms, name=name +
"_stacked", reverse_stack=reverse_stack)
1121 self.histograms = histograms
1123 self.attach_attributes()
1126 def create_stack(cls, histograms, name, reverse_stack, force_graph=False):
1127 """Create a stack of histograms"""
1128 if len(histograms) == 1:
1129 plot = histograms[0]
1131 if isinstance(histograms[0], (ROOT.TProfile, ROOT.TGraph))
or force_graph:
1132 plot = ROOT.TMultiGraph()
1134 plot = ROOT.THStack()
1141 for histogram
in reversed(histograms):
1142 if isinstance(histogram, ROOT.TProfile)
or (isinstance(histogram, ROOT.TH1)
and force_graph):
1143 histogram = cls.convert_tprofile_to_tgrapherrors(histogram)
1144 plot.Add(histogram,
"APZ")
1148 for histogram
in histograms:
1149 if isinstance(histogram, ROOT.TProfile)
or (isinstance(histogram, ROOT.TH1)
and force_graph):
1150 histogram = cls.convert_tprofile_to_tgrapherrors(histogram)
1151 plot.Add(histogram,
"APZ")
1158 def convert_tprofile_to_tgrapherrors(cls, tprofile, abs_x=False):
1159 """Extract errors from a TProfile histogram and create a TGraph from these"""
1160 if isinstance(tprofile, ROOT.TGraph):
1163 x_taxis = tprofile.GetXaxis()
1164 n_bins = x_taxis.GetNbins()
1167 bin_ids_without_underflow = list(range(1, n_bins + 1))
1169 bin_centers = np.array([x_taxis.GetBinCenter(i_bin)
for i_bin
in bin_ids_without_underflow])
1171 bin_centers = np.abs(bin_centers)
1172 bin_widths = np.array([x_taxis.GetBinWidth(i_bin)
for i_bin
in bin_ids_without_underflow])
1173 bin_x_errors = bin_widths / 2.0
1176 bin_contents = np.array([tprofile.GetBinContent(i_bin)
for i_bin
in bin_ids_without_underflow])
1177 bin_y_errors = np.array([tprofile.GetBinError(i_bin)
for i_bin
in bin_ids_without_underflow])
1179 tgrapherrors = ROOT.TGraphErrors(n_bins, bin_centers, bin_contents, bin_x_errors, bin_y_errors)
1181 tgrapherrors.GetHistogram().SetOption(
"APZ")
1183 tgrapherrors.SetLineColor(tprofile.GetLineColor())
1184 tgrapherrors.SetLineColor(tprofile.GetLineColor())
1187 for tobject
in tprofile.GetListOfFunctions():
1188 tgrapherrors.GetListOfFunctions().Add(tobject.Clone())
1191 cls.add_stats_entry(tgrapherrors,
'count', tprofile.GetEntries())
1193 stats_values = np.array([np.nan] * 6)
1194 tprofile.GetStats(stats_values)
1196 sum_w = stats_values[0]
1198 sum_wx = stats_values[2]
1199 sum_wx2 = stats_values[3]
1200 sum_wy = stats_values[4]
1201 sum_wy2 = stats_values[5]
1203 cls.add_stats_entry(tgrapherrors,
1207 cls.add_stats_entry(tgrapherrors,
1209 np.sqrt(sum_wx2 * sum_w - sum_wx * sum_wx) / sum_w)
1211 cls.add_stats_entry(tgrapherrors,
1215 cls.add_stats_entry(tgrapherrors,
1217 np.sqrt(sum_wy2 * sum_w - sum_wy * sum_wy) / sum_w)
1219 cls.add_stats_entry(tgrapherrors,
1221 tgrapherrors.GetCovariance())
1223 cls.add_stats_entry(tgrapherrors,
1225 tgrapherrors.GetCorrelationFactor())
1229 def fill_into_grouped(self,
1235 groupby_label="group"):
1236 """Fill data into similar histograms in groups indicated by a groupby array"""
1239 unique_groupbys = np.unique(groupbys)
1240 name = histogram_template.GetName()
1242 for i_value, value
in enumerate(unique_groupbys):
1244 indices_for_value = np.isnan(groupbys)
1246 indices_for_value = groupbys == value
1249 histogram_for_value = histogram_template.Clone(name +
'_' + str(value))
1250 i_root_color = i_value + 1
1252 self.set_color(histogram_for_value, i_root_color)
1255 self.add_stats_entry(histogram_for_value, groupby_label, value)
1257 self.fill_into(histogram_for_value,
1261 filter=indices_for_value)
1263 histograms.append(histogram_for_value)
1267 def set_color(self, tobject, root_i_color):
1268 """Set the color of the ROOT object.
1270 By default the line color of a TGraph should be invisible, so do not change it
1271 For other objects set the marker and the line color
1275 tobject : Plotable object inheriting from TAttLine and TAttMarker such as TGraph or TH1
1276 Object of which the color should be set.
1278 Color index of the ROOT color table
1280 if isinstance(tobject, ROOT.TGraph):
1281 tobject.SetMarkerColor(root_i_color)
1283 tobject.SetLineColor(root_i_color)
1284 tobject.SetMarkerColor(root_i_color)
1286 def fill_into(self, plot, xs, ys=None, weights=None, filter=None):
1287 """Fill the data into the plot object"""
1288 if isinstance(plot, ROOT.TGraph):
1290 raise ValueError(
"ys are required for filling a graph")
1291 self.fill_into_tgraph(plot, xs, ys, filter=filter)
1292 elif isinstance(plot, ROOT.TGraphErrors):
1294 raise ValueError(
"ys are required for filling a graph error")
1296 self.fill_into_tgrapherror(plot, xs, ys)
1298 self.fill_into_th1(plot, xs, ys, weights=weights, filter=filter)
1300 def fill_into_tgrapherror(self, graph, xs, ys, filter=None):
1301 """fill point values and error of the x and y axis into the graph"""
1303 assert (len(xs[0]) == len(ys[0]))
1305 graph.Set(len(xs[0]))
1307 for i
in range(len(xs[0])):
1308 graph.SetPoint(i, xs[0][i], ys[0][i])
1309 graph.SetPointError(i, xs[1][i], ys[1][i])
1311 def fill_into_tgraph(self, graph, xs, ys, filter=None):
1312 """Fill the data into a TGraph"""
1316 filter = slice(
None)
1326 if x_n_data > max_n_data
or y_n_data > max_n_data:
1327 get_logger().warning(f
"Number of points in scatter graph {self.name} exceed limit {max_n_data}")
1329 get_logger().warning(f
"Cropping {max_n_data}")
1331 xs = xs[0:max_n_data]
1332 ys = ys[0:max_n_data]
1334 x_axis = graph.GetXaxis()
1335 y_axis = graph.GetYaxis()
1337 x_lower_bound = x_axis.GetXmin()
1338 x_upper_bound = x_axis.GetXmax()
1340 y_lower_bound = y_axis.GetXmin()
1341 y_upper_bound = y_axis.GetXmax()
1343 x_underflow_indices = xs < x_lower_bound
1344 x_overflow_indices = xs > x_upper_bound
1346 y_underflow_indices = ys < y_lower_bound
1347 y_overflow_indices = ys > y_upper_bound
1349 plot_indices = ~(np.isnan(xs) |
1350 x_underflow_indices |
1351 x_overflow_indices |
1353 y_underflow_indices |
1356 n_plot_data = np.sum(plot_indices)
1357 plot_xs = xs[plot_indices]
1358 plot_ys = ys[plot_indices]
1360 graph.Set(int(n_plot_data))
1361 for i, (x, y)
in enumerate(zip(plot_xs, plot_ys)):
1362 graph.SetPoint(i, x, y)
1364 self.add_stats_entry(graph,
'count', np.sum(np.isfinite(xs)))
1366 self.add_nan_inf_stats(graph,
'x', xs)
1367 self.add_nan_inf_stats(graph,
'y', ys)
1369 x_n_underflow = np.sum(x_underflow_indices)
1371 self.add_stats_entry(graph,
'x underf.', x_n_underflow)
1373 x_n_overflow = np.sum(x_overflow_indices)
1375 self.add_stats_entry(graph,
'x overf.', x_n_overflow)
1377 y_n_underflow = np.sum(y_underflow_indices)
1379 self.add_stats_entry(graph,
'y underf.', y_n_underflow)
1381 y_n_overflow = np.sum(y_overflow_indices)
1383 self.add_stats_entry(graph,
'y overf.', y_n_overflow)
1385 self.add_stats_entry(graph,
'x avg', graph.GetMean(1))
1386 self.add_stats_entry(graph,
'x std', graph.GetRMS(1))
1388 self.add_stats_entry(graph,
'y avg', graph.GetMean(2))
1389 self.add_stats_entry(graph,
'y std', graph.GetRMS(2))
1391 self.add_stats_entry(graph,
'cov', graph.GetCovariance())
1392 self.add_stats_entry(graph,
'corr', graph.GetCorrelationFactor())
1394 def fill_into_th1(self, histogram, xs, ys=None, weights=None, filter=None):
1395 """Fill the histogram blocking non finite values
1399 histogram : ROOT.TH1
1400 The histogram to be filled
1401 xs : numpy.ndarray (1d)
1402 Data for the first axes
1403 ys : numpy.ndarray (1d), optional
1404 Data for the second axes
1405 weights : numpy.ndarray (1d), optional
1406 Weight of the individual points. Defaults to one for each
1407 filter : numpy.ndarray, optional
1408 Boolean index array indicating which entries shall be taken.
1412 filter = slice(
None)
1416 self.add_nan_inf_stats(histogram,
'x', xs)
1417 finite_filter = np.isfinite(xs)
1422 self.add_nan_inf_stats(histogram,
'y', ys)
1423 finite_filter &= np.isfinite(ys)
1426 xs = xs[finite_filter]
1427 weights = np.ones_like(xs)
1429 weights = weights[filter]
1430 self.add_nan_inf_stats(histogram,
'w', weights)
1431 finite_filter &= np.isfinite(weights)
1432 xs = xs[finite_filter]
1433 weights[finite_filter]
1436 ys = ys[finite_filter]
1441 except AttributeError:
1442 Fill = histogram.Fill
1444 fill = np.frompyfunc(Fill, 2, 1)
1445 fill(xs.astype(np.float64, copy=
False),
1446 weights.astype(np.float64, copy=
False))
1448 fill = np.frompyfunc(Fill, 3, 1)
1449 fill(xs.astype(np.float64, copy=
False),
1450 ys.astype(np.float64, copy=
False),
1451 weights.astype(np.float64, copy=
False))
1455 xs = xs.astype(np.float64, copy=
False)
1456 weights = weights.astype(np.float64, copy=
False)
1459 histogram.FillN(n, xs, weights)
1461 basf2.B2WARNING(
"No values to be filled into histogram: " + self.name)
1465 xs = xs.astype(np.float64, copy=
False)
1466 ys = ys.astype(np.float64, copy=
False)
1467 weights = weights.astype(np.float64, copy=
False)
1470 histogram.FillN(n, xs, ys, weights)
1472 basf2.B2WARNING(
"No values to be filled into histogram: " + self.name)
1474 self.set_additional_stats_tf1(histogram)
1477 def add_nan_inf_stats(cls, histogram, name, xs):
1478 """ Extracts the counts of non finite floats from a series
1479 and adds them as additional statistics to the histogram.
1483 histogram : derived from ROOT.TH1 or ROOT.TGraph
1484 Something having a GetListOfFunctions method that
1486 A label for the data series to be prefixed to the entries.
1487 xs : numpy.ndarray (1d)
1488 Data from which the non finit floats should be counted.
1490 n_nans = np.isnan(xs).sum()
1492 cls.add_stats_entry(histogram, name +
' nan', n_nans)
1494 n_positive_inf = np.sum(xs == np.inf)
1495 if n_positive_inf > 0:
1496 cls.add_stats_entry(histogram, name +
' pos inf', n_positive_inf)
1498 n_negative_inf = np.sum(xs == -np.inf)
1499 if n_negative_inf > 0:
1500 cls.add_stats_entry(histogram, name +
' neg inf', n_negative_inf)
1503 def add_stats_entry(cls, histogram, label, value):
1504 """Add a new additional statistics to the histogram.
1508 histogram : derived from ROOT.TH1 or ROOT.TGraph
1509 Something having a GetListOfFunctions method that holds the additional statistics
1511 Label of the statistic
1513 Value of the statistic
1515 stats_entry = StatsEntry(str(label), float(value))
1516 histogram.GetListOfFunctions().Add(stats_entry)
1517 cls.set_additional_stats_tf1(histogram)
1520 def get_additional_stats(cls, histogram):
1521 """Get the additional statistics from the histogram and return them a dict.
1525 histogram : derived from ROOT.TH1 or ROOT.TGraph
1526 Something having a GetListOfFunctions method that holds the additional statistics
1530 collection.OrderedDict
1531 A map of labels to values for the additional statistics
1533 additional_stats = collections.OrderedDict()
1534 for tobject
in histogram.GetListOfFunctions():
1535 if isinstance(tobject, StatsEntry):
1536 stats_entry = tobject
1537 label = stats_entry.GetName()
1538 value = stats_entry.GetVal()
1539 additional_stats[label] = value
1540 return additional_stats
1543 def gaus_slice_fit(cls, th2, name, z_score=None):
1544 """Extract a slice of a scatterplot and apply a Gaussian fit to it"""
1547 y_taxis = th2.GetYaxis()
1548 th2_lower_bound = y_taxis.GetXmin()
1549 th2_upper_bound = y_taxis.GetXmax()
1550 th2_height = y_taxis.GetXmax() - y_taxis.GetXmin()
1551 n_y_bins = y_taxis.GetNbins()
1553 y_mean = th2.GetMean(2)
1554 y_std = th2.GetStdDev(2)
1555 fit_lower_bound = max(th2_lower_bound, y_mean - z_score * y_std)
1556 fit_upper_bound = min(th2_upper_bound, y_mean + z_score * y_std)
1557 fit_height = fit_upper_bound - fit_lower_bound
1559 required_n_bins_inslice_filled = n_y_bins * fit_height / th2_height
1561 fit_lower_bound = th2_lower_bound
1562 fit_upper_bound = th2_upper_bound
1563 fit_height = fit_upper_bound - fit_lower_bound
1564 required_n_bins_inslice_filled = n_y_bins / 1.61
1567 required_n_bins_inslice_filled = min(required_n_bins_inslice_filled, n_y_bins / 1.61)
1569 fit_tf1 = ROOT.TF1(
"Fit",
"gaus", fit_lower_bound, fit_upper_bound)
1570 fit_tf1.SetParName(0,
"n")
1571 fit_tf1.SetParName(1,
"mean")
1572 fit_tf1.SetParName(2,
"std")
1576 param_fit_th1s = ROOT.TObjArray()
1577 th2.FitSlicesY(fit_tf1, i_first_bin, i_last_bin,
1578 int(required_n_bins_inslice_filled),
1579 fit_options, param_fit_th1s)
1581 th1_means = param_fit_th1s.At(1)
1582 th1_means.SetName(name)
1583 th1_means.SetTitle(th2.GetTitle())
1584 for label, value
in cls.get_additional_stats(th2).items():
1585 cls.add_stats_entry(th1_means, label, value)
1588 x_taxis = th2.GetXaxis()
1589 new_x_taxis = th1_means.GetXaxis()
1590 for i_bin
in range(x_taxis.GetNbins() + 2):
1591 label = x_taxis.GetBinLabel(i_bin)
1593 new_x_taxis.SetBinLabel(i_bin, label)
1596 data_lower_bound = th1_means.GetMinimum(fit_lower_bound)
1597 data_upper_bound = th1_means.GetMaximum(fit_upper_bound)
1598 data_height = data_upper_bound - data_lower_bound
1600 plot_lower_bound = max(fit_lower_bound, data_lower_bound - 0.05 * data_height)
1601 plot_upper_bound = min(fit_upper_bound, data_upper_bound + 0.05 * data_height)
1603 th1_means.SetMinimum(plot_lower_bound)
1604 th1_means.SetMaximum(plot_upper_bound)
1609 def cumulate(cls, histogram, cumulation_direction=None):
1610 """Cumulates the histogram inplace.
1614 histogram : ROOT.TH1 or ROOT.TProfile
1615 Filled histogram to be cumulated
1616 cumulation_direction : int, optional
1617 Direction is indicated by the sign.
1618 Positive means from left to right, negative means from right to left.
1619 If now cumulation direction is given return the histogram as is.
1624 Cumulated histogram potentially altered inplace.
1626 if not cumulation_direction:
1629 cumulate_backward = cumulation_direction < 0
1630 cumulate_forward =
not cumulate_backward
1632 if isinstance(histogram, ROOT.TH2):
1633 raise ValueError(
"Cannot cumulate a two dimensional histogram.")
1635 if isinstance(histogram, ROOT.TH3):
1636 raise ValueError(
"Cannot cumulate a three dimensional histogram.")
1638 if not isinstance(histogram, ROOT.TH1):
1639 raise ValueError(
"Can only cumulate a one dimensional histogram.")
1641 if isinstance(histogram, ROOT.TProfile):
1642 tprofile = histogram
1644 tgraph = cls.convert_tprofile_to_tgrapherrors(histogram)
1645 tgraph.SetName(tprofile.GetName())
1647 n_bins = histogram.GetNbinsX()
1649 cumulated_content = 0.0
1650 cumulated_entries = 0
1654 i_bins = list(range(0, n_bins + 2))
1655 if not cumulate_forward:
1656 i_bins = reversed(i_bins)
1658 for i_bin
in i_bins:
1660 bin_content = tprofile.GetBinContent(i_bin)
1661 bin_entries = tprofile.GetBinEffectiveEntries(i_bin)
1662 bin_std = tprofile.GetBinError(i_bin)
1664 if bin_entries != 0:
1665 cumulated_content = (
1666 1.0 * (cumulated_entries * cumulated_content + bin_entries * bin_content) /
1667 (cumulated_entries + bin_entries)
1671 math.hypot(cumulated_entries * cumulated_std, bin_entries * bin_std) /
1672 (cumulated_entries + bin_entries)
1675 cumulated_entries = cumulated_entries + bin_entries
1680 if i_point >= 0
and i_point < n_points:
1681 x = tgraph.GetX()[i_point]
1685 tgraph.SetPoint(i_point, x, cumulated_content)
1687 x_error = tgraph.GetErrorX(i_point)
1688 tgraph.SetPointError(i_point, x_error, cumulated_std)
1693 n_bins = histogram.GetNbinsX()
1694 cumulated_content = 0.0
1696 i_bins = list(range(0, n_bins + 2))
1697 if not cumulate_forward:
1698 i_bins = reversed(i_bins)
1700 for i_bin
in i_bins:
1701 bin_content = histogram.GetBinContent(i_bin)
1702 cumulated_content += bin_content
1703 histogram.SetBinContent(i_bin, cumulated_content)
1707 def determine_bin_edges(self,
1713 outlier_z_score=None,
1714 include_exceptionals=True,
1715 allow_discrete=False):
1716 """Deducing bin edges from a data series.
1720 xs : numpy.ndarray (1d)
1721 Data point for which a binning should be found.
1722 stackbys : numpy.ndarray (1d)
1723 Categories of the data points to be distinguishable
1724 bins : list(float) or int or None, optional
1725 Preset bin edges or preset number of desired bins.
1726 The default, None, means the bound should be extracted from data.
1727 The rice rule is used the determine the number of bins.
1728 If a list of floats is given return them immediately.
1729 lower_bound : float or None, optional
1730 Preset lower bound of the binning range.
1731 The default, None, means the bound should be extracted from data.
1732 upper_bound : float or None, optional
1733 Preset upper bound of the binning range.
1734 The default, None, means the bound should be extracted from data.
1735 outlier_z_score : float or None, optional
1736 Threshold z-score of outlier detection.
1737 The default, None, means no outlier detection.
1738 include_exceptionals : bool, optional
1739 If the outlier detection is active this switch indicates,
1740 if values detected as exceptionally frequent shall be included
1741 nevertheless into the binning range. Default is True,
1742 which means exceptionally frequent values as included
1743 even if they are detected as outliers.
1747 np.array (1d), list(str)
1748 Pair of bin edges and labels deduced from the series.
1749 Second element is None if the series is not detected as discrete.
1751 debug = get_logger().debug
1752 debug(
'Determine binning for plot named %s', self.name)
1758 elif isinstance(bins, collections.abc.Iterable):
1762 bin_edges = array.array(
'd', bin_edges)
1764 return bin_edges, bin_labels
1775 message = f
'Cannot accept n_bins={bins} as number of bins, because it is not a number greater than 0.'
1776 raise ValueError(message)
1779 xs = np.array(xs, copy=
False)
1781 if self.is_binary(xs)
or (allow_discrete
and self.is_discrete(xs)):
1783 debug(
'Discrete binning values encountered')
1784 finite_xs = xs[np.isfinite(xs)]
1785 unique_xs = np.unique(finite_xs)
1788 if lower_bound
is None:
1789 if len(unique_xs) == 0:
1790 if upper_bound
is None:
1793 lower_bound = upper_bound - 1
1795 lower_bound = np.min(unique_xs)
1797 unique_xs = unique_xs[unique_xs >= lower_bound]
1799 if upper_bound
is None:
1800 if len(unique_xs) == 0:
1801 upper_bound = lower_bound + 1
1803 upper_bound = np.min(unique_xs)
1805 unique_xs = unique_xs[unique_xs <= upper_bound]
1808 n_bins = len(unique_xs)
or 1
1810 if len(unique_xs) > 0
and n_bins >= len(unique_xs):
1812 bin_edges = array.array(
'd', unique_xs)
1813 format_bin_label = self.format_bin_label
1814 bin_labels = [format_bin_label(value)
for value
in bin_edges]
1815 bin_edges.append(bin_edges[-1] + 1)
1816 return bin_edges, bin_labels
1825 debug(
'Lower bound %s', lower_bound)
1826 debug(
'Upper bound %s', upper_bound)
1827 debug(
'N bins %s', n_bins)
1830 bin_range = self.determine_bin_range(xs,
1833 lower_bound=lower_bound,
1834 upper_bound=upper_bound,
1835 outlier_z_score=outlier_z_score,
1836 include_exceptionals=include_exceptionals)
1838 n_bins, lower_bound, upper_bound = bin_range
1840 n_bin_edges = n_bins + 1
1841 if lower_bound != upper_bound:
1843 debug(
"Creating flat distribution binning")
1844 percentiles = np.linspace(0.0, 100.0, n_bin_edges)
1845 bin_edges = np.unique(np.nanpercentile(xs[(lower_bound <= xs) & (xs <= upper_bound)], percentiles))
1849 bin_edges = np.linspace(lower_bound, upper_bound, n_bin_edges)
1854 bin_edges[0] = lower_bound
1855 bin_edges[-1] = np.nextafter(upper_bound, np.inf)
1856 debug(
'Bins %s', bin_edges)
1860 bin_edges = [lower_bound, upper_bound + 1]
1863 bin_edges = array.array(
'd', bin_edges)
1864 debug(
'Bins %s for %s', bin_edges, self.name)
1865 return bin_edges,
None
1867 def determine_bin_range(self,
1873 outlier_z_score=None,
1874 include_exceptionals=True):
1875 """Calculates the number of bins, the lower bound and the upper bound from a given data series
1876 estimating the values that are not given.
1878 If the outlier_z_score is given the method tries to exclude outliers that exceed a certain z-score.
1879 The z-score is calculated (x - x_mean) / x_std. The be robust against outliers the necessary
1880 mean and std deviation are based on truncated mean and a trimmed std calculated from the inter
1881 quantile range (IQR).
1883 If additional include_exceptionals is true the method tries to find exceptional values in the series
1884 and always include them in the range if it finds any.
1885 Exceptional values means exact values that appear often in the series for whatever reason.
1886 Possible reasons include
1887 * Integral / default values
1888 * Failed evaluation conditions
1890 which should be not cropped away automatically if you are locking on the quality of your data.
1894 xs : numpy.ndarray (1d)
1895 Data point for which a binning should be found.
1896 stackbys : numpy.ndarray (1d)
1897 Categories of the data points to be distinguishable
1898 n_bins : int or None, optional
1899 Preset number of desired bins. The default, None, means the bound should be extracted from data.
1900 The rice rule is used the determine the number of bins.
1901 lower_bound : float or None, optional
1902 Preset lower bound of the binning range.
1903 The default, None, means the bound should be extracted from data.
1904 upper_bound : float or None, optional
1905 Preset upper bound of the binning range.
1906 The default, None, means the bound should be extracted from data.
1907 outlier_z_score : float or None, optional
1908 Threshold z-score of outlier detection.
1909 The default, None, means no outlier detection.
1910 include_exceptionals : bool, optional
1911 If the outlier detection is active this switch indicates,
1912 if values detected as exceptionally frequent shall be included
1913 nevertheless into the binning range. Default is True,
1914 which means exceptionally frequent values as included
1915 even if they are detected as outliers.
1919 n_bins, lower_bound, upper_bound : int, float, float
1920 A triple of found number of bins, lower bound and upper bound of the binning range.
1923 if stackbys
is not None:
1924 unique_stackbys = np.unique(stackbys)
1926 for value
in unique_stackbys:
1928 indices_for_value = np.isnan(stackbys)
1930 indices_for_value = stackbys == value
1932 stack_lower_bound, stack_upper_bound = \
1933 self.determine_range(xs[indices_for_value],
1934 lower_bound=lower_bound,
1935 upper_bound=upper_bound,
1936 outlier_z_score=outlier_z_score,
1937 include_exceptionals=include_exceptionals)
1939 stack_ranges.append([stack_lower_bound, stack_upper_bound])
1941 lower_bound = np.nanmin([lwb
for lwb, upb
in stack_ranges])
1942 upper_bound = np.nanmax([upb
for lwb, upb
in stack_ranges])
1945 lower_bound, upper_bound = self.determine_range(xs,
1946 lower_bound=lower_bound,
1947 upper_bound=upper_bound,
1948 outlier_z_score=outlier_z_score,
1949 include_exceptionals=include_exceptionals)
1954 n_data = np.sum((lower_bound <= xs) & (xs <= upper_bound))
1955 rice_n_bins = int(statistics.rice_n_bin(n_data))
1956 n_bins = rice_n_bins
1959 n_bins = int(n_bins)
1962 message = f
'Cannot accept n_bins={n_bins} as number of bins, because it is not a number greater than 0.'
1963 raise ValueError(message)
1965 return n_bins, lower_bound, upper_bound
1967 def determine_range(self,
1971 outlier_z_score=None,
1972 include_exceptionals=True):
1976 xs : numpy.ndarray (1d)
1977 Data point for which a binning should be found.
1978 lower_bound : float or None, optional
1979 Preset lower bound of the binning range.
1980 The default, None, means the bound should be extracted from data.
1981 upper_bound : float or None, optional
1982 Preset upper bound of the binning range.
1983 The default, None, means the bound should be extracted from data.
1984 outlier_z_score : float or None, optional
1985 Threshold z-score of outlier detection.
1986 The default, None, means no outlier detection.
1987 include_exceptionals : bool, optional
1988 If the outlier detection is active this switch indicates,
1989 if values detected as exceptionally frequent shall be included
1990 nevertheless into the binning range. Default is True,
1991 which means exceptionally frequent values as included
1992 even if they are detected as outliers.
1996 lower_bound, upper_bound : float, float
1997 A pair of found lower bound and upper bound of series.
1999 debug = get_logger().debug
2001 finite_xs_indices = np.isfinite(xs)
2002 if np.any(finite_xs_indices):
2003 finite_xs = xs[finite_xs_indices]
2007 make_symmetric =
False
2008 exclude_outliers = outlier_z_score
is not None and (lower_bound
is None or upper_bound
is None)
2011 if include_exceptionals
or exclude_outliers:
2012 exceptional_xs = self.get_exceptional_values(finite_xs)
2013 exceptional_indices = np.in1d(finite_xs, exceptional_xs)
2016 if exclude_outliers:
2017 if not np.all(exceptional_indices):
2020 x_mean, x_std = self.get_robust_mean_and_std(finite_xs[~exceptional_indices])
2022 x_mean, x_std = np.nan, np.nan
2024 make_symmetric = abs(x_mean) < x_std / 5.0
and lower_bound
is None and upper_bound
is None
2026 if include_exceptionals
and len(exceptional_xs) != 0:
2027 lower_exceptional_x = np.min(exceptional_xs)
2028 upper_exceptional_x = np.max(exceptional_xs)
2029 make_symmetric =
False
2031 lower_exceptional_x = np.nan
2032 upper_exceptional_x = np.nan
2035 if lower_bound
is None:
2037 lower_bound = np.min(finite_xs)
2041 if outlier_z_score
is not None:
2043 lower_outlier_bound = x_mean - outlier_z_score * x_std
2047 indices_above_lower_outlier_bound = finite_xs >= lower_outlier_bound
2049 if np.any(indices_above_lower_outlier_bound):
2050 lower_bound = np.min(finite_xs[indices_above_lower_outlier_bound])
2053 lower_bound = np.nanmin([lower_bound, lower_exceptional_x])
2055 debug(
'Lower bound after outlier detection')
2056 debug(
'Lower bound %s', lower_bound)
2057 debug(
'Lower outlier bound %s', lower_outlier_bound)
2060 if upper_bound
is None:
2062 upper_bound = np.max(finite_xs)
2065 if outlier_z_score
is not None:
2067 upper_outlier_bound = x_mean + outlier_z_score * x_std
2071 indices_below_upper_outlier_bound = finite_xs <= upper_outlier_bound
2073 if np.any(indices_below_upper_outlier_bound):
2074 upper_bound = np.max(finite_xs[indices_below_upper_outlier_bound])
2077 upper_bound = np.nanmax([upper_bound, upper_exceptional_x])
2079 debug(
'Upper bound after outlier detection')
2080 debug(
'Upper bound %s', upper_bound)
2081 debug(
'Upper outlier bound %s', upper_outlier_bound)
2083 if make_symmetric
and lower_bound < 0
and upper_bound > 0:
2084 if abs(abs(lower_bound) - abs(upper_bound)) < x_std / 5.0:
2085 abs_bound = max(abs(lower_bound), abs(upper_bound))
2086 lower_bound = -abs_bound
2087 upper_bound = abs_bound
2089 return lower_bound, upper_bound
2092 def set_additional_stats_tf1(cls, histogram):
2093 """Combining fit TF1 with the additional statistics and attach them to the histogram.
2097 histogram : ROOT.TH1 or ROOT.TGraph or ROOT.TMultiGraph
2098 Something having a GetListOfFunctions method that should hold
2099 the combined fit and additional statistics function.
2101 additional_stats_tf1 = cls.create_additional_stats_tf1(histogram)
2102 cls.set_tf1(histogram, additional_stats_tf1)
2105 def set_fit_tf1(cls, histogram, fit_tf1):
2106 """Combining fit TF1 with the additional statistics and attach them to the histogram.
2110 histogram : ROOT.TH1 or ROOT.TGraph or ROOT.TMultiGraph
2111 Something having a GetListOfFunctions method that should hold
2112 the combined fit and additional statistics function.
2114 additional_stats_tf1 = cls.create_additional_stats_tf1(histogram)
2115 combined_tf1 = cls.combine_fit_and_additional_stats(fit_tf1, additional_stats_tf1)
2116 cls.set_tf1(histogram, combined_tf1)
2119 def set_tf1(cls, histogram, tf1):
2120 """Set the attached TF1 of the histogram.
2124 histogram : ROOT.TH1 or ROOT.TGraph or ROOT.TMultiGraph
2125 Something having a GetListOfFunctions method that should hold
2126 the combined fit and additional statistics function.
2129 cls.delete_tf1(histogram)
2131 tf1.SetName(
"FitAndStats")
2132 histogram.GetListOfFunctions().Add(tf1)
2135 def delete_tf1(cls, histogram):
2136 """Delete the attached TF1 from the histogram
2140 histogram : ROOT.TH1 or ROOT.TGraph
2141 Something having a GetListOfFunctions method that holds the fit function
2143 tf1 = histogram.FindObject(
"FitAndStats")
2145 function_list = histogram.GetListOfFunctions()
2146 function_list.Remove(tf1)
2149 def create_additional_stats_tf1(cls, histogram):
2150 """Create a TF1 with the additional statistics from the histogram as parameters.
2154 histogram : ROOT.TH1 or ROOT.TGraph
2155 Something having a GetListOfFunctions method that holds the additional statistics.
2160 Function with the additional statistics as parameters.
2163 additional_stats = cls.get_additional_stats(histogram)
2164 if not additional_stats:
2174 formula_string =
'+'.join(
'0*[' + str(i) +
']' for i
in range(len(additional_stats)))
2177 additional_stats_tf1 = ROOT.TF1(
"Stats", formula_string, lower_bound, upper_bound)
2179 for (i, (label, value))
in enumerate(additional_stats.items()):
2183 label = label.replace(
" ",
"-")
2184 additional_stats_tf1.SetParName(i, label)
2185 additional_stats_tf1.FixParameter(i, value)
2187 return additional_stats_tf1
2190 def combine_fit_and_additional_stats(cls, fit_tf1, additional_stats_tf1):
2191 """Combine the fit function and the function carrying the additional statistics to one function.
2197 additional_stats_tf1 : ROOT.TF1
2198 The function carrying the additional statistics as parameters
2204 if additional_stats_tf1
is None:
2210 lower_bound = ctypes.c_double()
2211 upper_bound = ctypes.c_double()
2212 fit_tf1.GetRange(lower_bound, upper_bound)
2213 title = fit_tf1.GetTitle()
2215 combined_formula = additional_stats_tf1.GetExpFormula().Data() +
'+' + fit_tf1.GetExpFormula().Data()
2216 combined_tf1 = ROOT.TF1(
"Combined", combined_formula, lower_bound.value, upper_bound.value)
2217 combined_tf1.SetTitle(title)
2220 chi2 = fit_tf1.GetChisquare()
2221 combined_tf1.SetChisquare(chi2)
2223 ndf = fit_tf1.GetNDF()
2224 combined_tf1.SetNDF(ndf)
2226 n_stats_parameters = additional_stats_tf1.GetNpar()
2228 cls.copy_tf1_parameters(additional_stats_tf1, combined_tf1)
2229 cls.copy_tf1_parameters(fit_tf1, combined_tf1, offset=n_stats_parameters)
2234 def copy_tf1_parameters(cls, tf1_source, tf1_target, offset=0):
2235 """Copy the parameters of one TF1 to another.
2239 tf1_source : ROOT.TF1
2240 Take parameters from here
2241 tf1_target : ROOT.TF1
2243 offset : int, optional
2244 Index of the first target parameter to which to copy.
2246 n_parameters = tf1_source.GetNpar()
2249 lower_bound = ctypes.c_double()
2250 upper_bound = ctypes.c_double()
2252 for i_source
in range(n_parameters):
2253 parameter_name = tf1_source.GetParName(i_source)
2254 i_target = tf1_target.GetParNumber(parameter_name)
2258 for i_target
in range(tf1_target.GetNpar()):
2259 if parameter_name == tf1_target.GetParName(i_target):
2265 tf1_target.SetParameter(i_target,
2266 tf1_source.GetParameter(i_source))
2267 tf1_target.SetParError(i_target,
2268 tf1_source.GetParError(i_source))
2270 tf1_source.GetParLimits(i_source, lower_bound, upper_bound)
2271 tf1_target.SetParLimits(i_target, lower_bound.value, upper_bound.value)
2273 def attach_attributes(self):
2274 """Reassign the special attributes of the plot forwarding them to the ROOT plot."""
2277 self.check = self.check
2279 self.contact = self.contact
2281 self.description = self.description
2284 self.xlabel = self.xlabel
2286 self.ylabel = self.ylabel
2288 self.title = self.title
2290 def set_maximum(self, maximum):
2291 """Sets the maximum of the vertical plotable range"""
2292 for histogram
in self.histograms:
2293 if isinstance(histogram, ROOT.TH1):
2294 histogram.SetMaximum(histogram.GetMaximum(maximum))
2296 histogram.SetMaximum(maximum)
2298 def set_minimum(self, minimum):
2299 """Sets the minimum of the vertical plotable range"""
2300 for histogram
in self.histograms:
2301 if isinstance(histogram, ROOT.TH1):
2302 histogram.SetMinimum(histogram.GetMinimum(minimum))
2304 histogram.SetMinimum(minimum)
2307 def set_tstyle(cls):
2308 """Set the style such that the additional stats entries are shown by the TBrowser"""
2309 belle2_validation_style_name = "belle2_validation_style"
2310 belle2_validation_tstyle = ROOT.gROOT.GetStyle(belle2_validation_style_name)
2311 if not belle2_validation_tstyle:
2312 belle2_validation_tstyle = ROOT.TStyle(belle2_validation_style_name, belle2_validation_style_name)
2315 belle2_validation_tstyle.SetOptFit(opt_fit)
2318 belle2_validation_tstyle.SetOptStat(opt_stat)
2319 ROOT.gROOT.SetStyle(belle2_validation_style_name)
2321 # belle2_validation_tstyle.SetLineStyleString(cls.very_sparse_dots_line_style_index, "4 2000")
2324 belle2_validation_tstyle.cd()
2328 """Simple test method"""
2329 ValidationPlot.set_tstyle()
2331 # Test a histogram plot with some nan and inf values
2332 normal_distributed_values = np.random.randn(1000)
2335 normal_distributed_values[i] = np.nan
2337 for i in range(10, 20):
2338 normal_distributed_values[i] = np.inf
2340 for i in range(20, 30):
2341 normal_distributed_values[i] = -np.inf
2343 validation_histogram = ValidationPlot('test_hist')
2344 validation_histogram.hist(normal_distributed_values)
2345 validation_histogram.title = 'A normal distribution'
2346 validation_histogram.xlabel = 'normal'
2347 validation_histogram.ylabel = 'frequency'
2348 validation_histogram.fit_gaus()
2350 # Test for a cumulated histogram - cumulation from left to right
2351 cumulated_histogram = ValidationPlot('test_cum_hist')
2352 cumulated_histogram.hist(normal_distributed_values, cumulation_direction=1)
2353 cumulated_histogram.title = 'A cumulated normal distribution'
2354 cumulated_histogram.xlabel = 'normal'
2355 cumulated_histogram.ylabel = 'cdf'
2356 cumulated_histogram.show()
2358 # Test stacked histogram
2359 # Make a random selection of 40%
2360 stackby = np.random.binomial(1.0, 0.40, 1000)
2361 stacked_validation_histogram = ValidationPlot('test_stacked_hist')
2362 stacked_validation_histogram.hist(normal_distributed_values, stackby=stackby)
2364 # Make a scatterplot with two species.
2365 x = np.random.randn(1000)
2366 y = 3 * np.random.randn(1000)
2367 ones = np.ones_like(x)
2370 x1 = np.where(stackby != 0, np.cos(angle) * ones, ones) * x + np.where(stackby != 0, np.sin(angle) * ones, ones) * y
2371 y1 = np.where(stackby != 0, np.sin(angle) * ones, ones) * x - np.where(stackby != 0, np.cos(angle) * ones, ones) * y
2373 stacked_validation_scatter = ValidationPlot('test_stacked_scatter')
2374 stacked_validation_scatter.scatter(x1, y1, stackby=stackby)
2376 # Make a profile plot with the two species
2377 stacked_validation_profile = ValidationPlot('test_stacked_profile')
2378 stacked_validation_profile.profile(x1, y1, stackby=stackby)
2380 # Make a two dimensional histogram with two species
2381 stacked_validation_hist2d = ValidationPlot('test_stacked_hist2d')
2382 stacked_validation_hist2d.hist2d(x1, y1, stackby=stackby)
2384 # Test a profile with a diagonal fit
2385 x = np.linspace(-1, 1, 1000)
2388 diagonal_plot = ValidationPlot('test_diag')
2389 diagonal_plot.profile(x, y, bins=50)
2390 diagonal_plot.fit_line()
2392 # Test if cumulation works for profile plots
2393 cumulated_profile = ValidationPlot('test_cum_profile')
2394 cumulated_profile.profile(x, y, bins=50, cumulation_direction=1)
2396 tfile = ROOT.TFile('test.root', 'RECREATE')
2398 validation_histogram.write(tfile)
2400 with root_cd("expert") as tdirectory1:
2401 diagonal_plot.write(tdirectory1)
2402 cumulated_profile.write(tdirectory1)
2403 cumulated_histogram.write(tdirectory1)
2405 with root_cd("stacked") as tdirectory2:
2406 stacked_validation_histogram.write(tdirectory2)
2407 stacked_validation_scatter.write()
2408 stacked_validation_profile.write()
2409 stacked_validation_hist2d.write()
2413 tfile = ROOT.TFile('test.root')
2414 tBrowser = ROOT.TBrowser()
2415 tBrowser.BrowseObject(tfile)
2420if __name__ == '__main__':