def process(self):
    """
    Create a PDF file with validation plots for the track quality estimator,
    based on the variables collected by the harvesting validation task.

    Main process that is dispatched by the ``run`` method that is inherited
    from ``Basf2Task``.
    """

    validation_harvest_basename = self.harvesting_validation_task_instance.validation_output_file_name
    validation_harvest_path = self.get_input_file_names(validation_harvest_basename)[0]

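    # Read the columns of the pattern-recognition (PR) track tree that are
    # needed for the fake-rate, clone-rate and kinematic plots below.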
    pr_columns = [
        'is_fake', 'is_clone', 'is_matched', 'quality_indicator',
        'experiment_number', 'run_number', 'event_number', 'pr_store_array_number',
        'pt_estimate', 'z0_estimate', 'd0_estimate', 'tan_lambda_estimate',
        'phi0_estimate', 'pt_truth', 'z0_truth', 'd0_truth', 'tan_lambda_truth',
        'phi0_truth',
    ]

    pr_df = uproot.open(validation_harvest_path)['pr_tree/pr_tree'].arrays(pr_columns, library='pd')
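
    # Read the Monte-Carlo (MC) track tree; it is used for the finding
    # efficiency, which is defined with respect to the MC tracks.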
    mc_columns = [
        'experiment_number',
        'run_number',
        'event_number',
        'pr_store_array_number',
        'is_missing',
        'is_primary',
    ]

    mc_df = uproot.open(validation_harvest_path)['mc_tree/mc_tree'].arrays(mc_columns, library='pd')
    if self.primaries_only:
        mc_df = mc_df[mc_df.is_primary.eq(True)]
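    # Scan a range of quality-indicator (QI) cut values; each rate and
    # efficiency below is evaluated once per cut value.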
    qi_cuts = np.linspace(0., 1, 20, endpoint=False)

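    # Collect all validation plots in a single PDF file.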
    output_pdf_file_path = self.get_output_file_name(self.output_pdf_file_basename)
    with PdfPages(output_pdf_file_path, keep_empty=False) as pdf:

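        # Title page with metadata describing how the plots were produced.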
        titlepage_fig, titlepage_ax = plt.subplots()
        titlepage_ax.axis("off")
        title = f"Quality Estimator validation plots from {self.__class__.__name__}"
        titlepage_ax.set_title(title)
        teacher_task = self.harvesting_validation_task_instance.teacher_task
        weightfile_identifier = teacher_task.get_weightfile_xml_identifier(teacher_task, fast_bdt_option=self.fast_bdt_option)
        meta_data = {
            "Date": datetime.today().strftime("%Y-%m-%d %H:%M"),
            "Created by steering file": os.path.realpath(__file__),
            "Created from data in": validation_harvest_path,
            "Background directory": MasterTask.bkgfiles_by_exp[self.experiment_number],
            "weight file": weightfile_identifier,
        }
        if hasattr(self, 'exclude_variables'):
            meta_data["Excluded variables"] = ", ".join(self.exclude_variables)
        meta_data_string = (format_dictionary(meta_data) +
                            "\n\n(For all MVA training parameters look into the produced weight file)")
        luigi_params = get_serialized_parameters(self)
        luigi_param_string = (f"\n\nb2luigi parameters for {self.__class__.__name__}\n" +
                              format_dictionary(luigi_params))
        title_page_text = meta_data_string + luigi_param_string
        titlepage_ax.text(0, 1, title_page_text, ha="left", va="top", wrap=True, fontsize=8)
        pdf.savefig(titlepage_fig)
        plt.close(titlepage_fig)
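        # Fake rate as a function of the QI requirement.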
        fake_rates = get_uncertain_means_for_qi_cuts(pr_df, "is_fake", qi_cuts)
        fake_fig, fake_ax = plt.subplots()
        fake_ax.set_title("Fake rate")
        plot_with_errobands(fake_rates, ax=fake_ax)
        fake_ax.set_ylabel("fake rate")
        fake_ax.set_xlabel("quality indicator requirement")
        pdf.savefig(fake_fig, bbox_inches="tight")
        plt.close(fake_fig)
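        # Clone rate as a function of the QI requirement.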
        clone_rates = get_uncertain_means_for_qi_cuts(pr_df, "is_clone", qi_cuts)
        clone_fig, clone_ax = plt.subplots()
        clone_ax.set_title("Clone rate")
        plot_with_errobands(clone_rates, ax=clone_ax)
        clone_ax.set_ylabel("clone rate")
        clone_ax.set_xlabel("quality indicator requirement")
        pdf.savefig(clone_fig, bbox_inches="tight")
        plt.close(clone_fig)
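        # Attach the quality indicator of the matched PR track to each MC track,
        # so that the finding efficiency can be evaluated per QI cut.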
        pr_track_identifiers = ['experiment_number', 'run_number', 'event_number', 'pr_store_array_number']
        mc_df = upd.merge(
            left=mc_df, right=pr_df[pr_track_identifiers + ['quality_indicator']],
            how='left',
            on=pr_track_identifiers
        )
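        # For each QI cut, take the fraction of missing MC tracks among those
        # that either have no quality indicator (no PR match) or whose matched
        # PR track passes the cut.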
        missing_fractions = (
            _my_uncertain_mean(mc_df[
                mc_df.quality_indicator.isnull() | (mc_df.quality_indicator > qi_cut)]['is_missing'])
            for qi_cut in qi_cuts
        )
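        # Finding efficiency (1 - missing fraction) as a function of the QI requirement.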
        findeff_fig, findeff_ax = plt.subplots()
        findeff_ax.set_title("Finding efficiency")
        finding_efficiencies = 1.0 - upd.Series(data=missing_fractions, index=qi_cuts)
        plot_with_errobands(finding_efficiencies, ax=findeff_ax)
        findeff_ax.set_ylabel("finding efficiency")
        findeff_ax.set_xlabel("quality indicator requirement")
        pdf.savefig(findeff_fig, bbox_inches="tight")
        plt.close(findeff_fig)
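        # ROC-style curves: fake rate and clone rate vs. finding efficiency,
        # with one point per QI cut value.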
        fake_roc_fig, fake_roc_ax = plt.subplots()
        fake_roc_ax.set_title("Fake rate vs. finding efficiency ROC curve")
        fake_roc_ax.errorbar(x=finding_efficiencies.nominal_value, y=fake_rates.nominal_value,
                             xerr=finding_efficiencies.std_dev, yerr=fake_rates.std_dev, elinewidth=0.8)
        fake_roc_ax.set_xlabel('finding efficiency')
        fake_roc_ax.set_ylabel('fake rate')
        pdf.savefig(fake_roc_fig, bbox_inches="tight")
        plt.close(fake_roc_fig)
        clone_roc_fig, clone_roc_ax = plt.subplots()
        clone_roc_ax.set_title("Clone rate vs. finding efficiency ROC curve")
        clone_roc_ax.errorbar(x=finding_efficiencies.nominal_value, y=clone_rates.nominal_value,
                              xerr=finding_efficiencies.std_dev, yerr=clone_rates.std_dev, elinewidth=0.8)
        clone_roc_ax.set_xlabel('finding efficiency')
        clone_roc_ax.set_ylabel('clone rate')
        pdf.savefig(clone_roc_fig, bbox_inches="tight")
        plt.close(clone_roc_fig)
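        # Kinematic distributions of the track-parameter estimates, stacked by
        # matched/clone/fake, for a few representative QI requirements.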
        params = ['d0', 'z0', 'pt', 'tan_lambda', 'phi0']
        label_by_param = {
            "pt": "$p_T$",
            "z0": "$z_0$",
            "d0": "$d_0$",
            "tan_lambda": r"$\tan{\lambda}$",
            "phi0": r"$\phi_0$"
        }
        unit_by_param = {
            "pt": "GeV",
            "z0": "cm",
            "d0": "cm",
            "tan_lambda": "rad",
            "phi0": "rad"
        }
        n_kinematic_bins = 75
        bins_by_param = {
            "pt": np.linspace(0, np.percentile(pr_df['pt_truth'].dropna(), 95), n_kinematic_bins),
            "z0": np.linspace(-0.1, 0.1, n_kinematic_bins),
            "d0": np.linspace(0, 0.01, n_kinematic_bins),
            "tan_lambda": np.linspace(-2, 3, n_kinematic_bins),
            "phi0": np.linspace(0, 2 * np.pi, n_kinematic_bins)
        }

        kinematic_qi_cuts = [0, 0.5, 0.8]
        blue, yellow, green = plt.get_cmap("tab10").colors[0:3]
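        # One figure per track parameter, with one panel per QI requirement.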
        for param in params:
            fig, axarr = plt.subplots(ncols=len(kinematic_qi_cuts), sharey=True, sharex=True, figsize=(14, 6))
            fig.suptitle(f"{label_by_param[param]} distributions")
            for i, qi in enumerate(kinematic_qi_cuts):
                ax = axarr[i]
                ax.set_title(f"QI > {qi}")
                incut = pr_df[(pr_df['quality_indicator'] > qi)]
                incut_matched = incut[incut.is_matched.eq(True)]
                incut_clones = incut[incut.is_clone.eq(True)]
                incut_fake = incut[incut.is_fake.eq(True)]

                if any(series.empty for series in (incut, incut_matched, incut_clones, incut_fake)):
                    ax.text(0.5, 0.5, "Not enough data in bin", ha="center", va="center", transform=ax.transAxes)
                    continue

                bins = bins_by_param[param]
                stacked_histogram_series_tuple = (
                    incut_matched[f'{param}_estimate'],
                    incut_clones[f'{param}_estimate'],
                    incut_fake[f'{param}_estimate'],
                )
                histvals, _, _ = ax.hist(stacked_histogram_series_tuple,
                                         stacked=True,
                                         bins=bins, range=(bins.min(), bins.max()),
                                         color=(blue, green, yellow),
                                         label=("matched", "clones", "fakes"))
                ax.set_xlabel(f'{label_by_param[param]} estimate / ({unit_by_param[param]})')
                ax.set_ylabel('# tracks')
            axarr[0].legend(loc="upper center", bbox_to_anchor=(0, -0.15))
            pdf.savefig(fig, bbox_inches="tight")
            plt.close(fig)