``%(prog)s`` is a tool for converting a list of LPNs into the YAML format expected by
:ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
LPNs, like those which can be downloaded from the dataset searcher.

The test sample labels (under the key ``sampleLabel``) are automatically generated, so
please check that they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
the YAML file has been produced.

--epilog--
.. rubric:: Example usage

* Convert a list of BGx1 MCri LPNs into YAML format and print to screen::

      $ %(prog)s my_MCri_LPNs.txt --mcri --bg BGx1

* Convert a list of BGx1 MCrd LPNs into YAML format and print to screen::

      $ %(prog)s my_MCrd_LPNs.txt --mcrd --bg BGx1

* Convert a list of data LPNs into YAML format and save to file::

      $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml
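
For illustration only, a sketch of one data block in the generated YAML (the values
here are hypothetical; the keys are the ones written by this tool)::

    proc13_exp18r1:
      sampleLabel: DUMMY_LABEL  # check against skim/scripts/TestFiles.yaml
      inputReleaseNumber: release-06-00-08
      prodNumber: 12345
      inputDBGlobalTag: 2100
      procNumber: proc13
      experimentNumber: e0018
      beamEnergy: 4S
      inputDataLevel: mdst
      runNumbers:
        - r00123
        - r00124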

import argparse
import re
import sys
from pathlib import Path

import pandas as pd
import yaml

from skim.utils.testfiles import DataSample, MCSample


def get_argument_parser():
    description, epilog = __doc__.split("--epilog--")
    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "input",
        metavar="input_lpn_list_file",
        help="Input file containing list of LPNs (such as that from the dataset searcher).",
    )
    parser.add_argument(
        "-o",
        "--output",
        metavar="output_filename",
        help="Output YAML file name. If none given, prints output to screen.",
    )
    DataMCGroup = parser.add_mutually_exclusive_group(required=True)
    DataMCGroup.add_argument(
        "--data", action="store_true", help="Flag to indicate the LPNs are for data."
    )
    DataMCGroup.add_argument(
        "--mcri", action="store_true", help="Flag to indicate the LPNs are for run-independent MC."
    )
    DataMCGroup.add_argument(
        "--mcrd", action="store_true", help="Flag to indicate the LPNs are for run-dependent MC."
    )
    parser.add_argument(
        "--bg",
        choices=("BGx0", "BGx1"),
        required=("--mcri" in sys.argv or "--mcrd" in sys.argv),
        help="Beam background level of MC samples. Only required for MC.",
    )
    return parser


def verify_dataframe(df, mcri):
    """Run basic sanity checks on the LPN columns before any YAML is written."""
    if not (
        all(df["release"].str.startswith("release-"))
        and all(df["DBGT"].str.startswith("DB"))
        and all(df["expNumber"].str.startswith("e"))
        and all(df["runNumber"].str.startswith("r"))
    ):
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    if not all(df["dataLevel"].str.match("mdst")):
        raise ValueError("Input LPNs must all be mdst.")
    if "generalSkimName" in df.columns and len(set(df["generalSkimName"])) > 1:
        raise ValueError("More than one GeneralSkimName in input data LPNs.")
    if mcri and len(set(df["runNumber"])) > 1:
        raise ValueError("More than one run number listed for MC LPNs.")


def to_yaml(data, output=None):
    """Print to screen or file as YAML format."""
    string = yaml.dump(data, sort_keys=False)

    # Append a reminder comment after every sampleLabel line in the dumped YAML
    warning = "# TODO: Ensure this label matches a sample label in SkimStats.json"
    string = re.sub("(sampleLabel.*)$", f"\\1 {warning}", string, flags=re.MULTILINE)

    if isinstance(output, (str, Path)):
        OutputFilename = Path(output).with_suffix(".yaml")
        with open(OutputFilename, "w") as f:
            f.write(string)
        print(f"Wrote YAML file to {OutputFilename}", file=sys.stderr)
    else:
        print(string)


def main():
    parser = get_argument_parser()
    args = parser.parse_args()

    # Read in the list of LPNs, ignoring empty lines
    with open(args.input) as f:
        LPNs = sorted(filter(None, f.read().split("\n")))

    # All LPNs are expected to share a common prefix up to the release directory
    prefixes = {re.match(r"^(.*)(/release)", LPN)[1] for LPN in LPNs}
    if len(prefixes) > 1:
        raise ValueError("Somehow got multiple different prefixes!")
    prefix = list(prefixes)[0]

    # Strip the common prefix and split the remaining LPN components into a DataFrame
    LPNs = [re.sub(r"^.*(release)", r"\1", LPN) for LPN in LPNs]
    df = pd.DataFrame([Path(LPN).parts for LPN in LPNs])
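
    # As an illustration (the exact components depend on the LPN layout): after the
    # prefix is stripped, Path("release-06-00-08/DB00002100/.../r00123/mdst").parts
    # yields ("release-06-00-08", "DB00002100", ..., "r00123", "mdst"), so each LPN
    # component becomes one column of the DataFrame.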

    if len(df.columns) == 8:
        # Mapping from column index to LPN component name (definition elided)
        columns = ...
    elif len(df.columns) == 9:
        # Mapping for the longer LPN layout (definition elided)
        columns = ...
    df.rename(columns=columns, inplace=True)
    verify_dataframe(df, args.mcri)

    if args.mcri or args.mcrd:
        # MC LPNs contain an extra "s00" component; fold it into the experiment number
        df.loc[:, "expNumber"] = df["s00"] + "/" + df["expNumber"]
        df.drop("s00", axis=1, inplace=True)
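        # For instance (hypothetical values), expNumber "e1003" becomes "s00/e1003"
        # here, which is why the run-dependent MC branch below strips the prefix again
        # with r"^s00/e0*".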

    DataBlocks = {}

    if args.data:
        ExpGroups = df.groupby(["campaign", "expNumber"])
        for (campaign, expNumber), ExpGroup in ExpGroups:
            groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
            for iGroup, ((beamEnergy, release, DBGT, prodNumber), group) in enumerate(
                groups
            ):
                # Strip the zero-padded prefixes to get plain integers
                prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
                DBGT = int(re.sub(r"^DB0*", "", DBGT))
                expInteger = int(re.sub(r"^e0*", "", expNumber))

                onres = beamEnergy == "4S"
                if onres:
                    label = f"{campaign}_exp{expInteger}r{iGroup+1}"
                else:
                    label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"

                if "generalSkimName" in df.columns:
                    generalSkim = list(group["generalSkimName"])[0]
                else:
                    generalSkim = "all"  # assumed default when no GeneralSkim label is present

                # Build the sample label from the LPN metadata (the location is a placeholder)
                sampleLabel = DataSample(
                    location="DUMMY_PATH",
                    processing=campaign,
                    experiment=expInteger,
                    beam_energy=beamEnergy,
                    general_skim=generalSkim,
                ).encodeable_name

                DataBlocks[label] = {
                    "sampleLabel": sampleLabel,
                    "inputReleaseNumber": release,
                    "prodNumber": prodNumber,
                    "inputDBGlobalTag": DBGT,
                    "procNumber": campaign,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "inputDataLevel": "mdst",
                    "runNumbers": list(group["runNumber"]),
                }

                if "generalSkimName" in df.columns:
                    DataBlocks[label]["generalSkimName"] = list(
                        group["generalSkimName"]
                    )[0]

    elif args.mcrd:
        ExpGroups = df.groupby(["campaign", "MCEventType", "expNumber"])
        for (campaign, MCEventType, expNumber), ExpGroup in ExpGroups:
            groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
            for iGroup, ((beamEnergy, release, DBGT, prodNumber), group) in enumerate(
                groups
            ):
                prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
                DBGT = int(re.sub(r"^DB0*", "", DBGT))
                expInteger = int(re.sub(r"^s00/e0*", "", expNumber))

                onres = beamEnergy == "4S"
                if onres:
                    label = f"{campaign}_exp{expInteger}r{iGroup+1}"
                else:
                    label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"

                DataBlocks[label] = {
                    "sampleLabel": f"MC-{campaign}-{beamEnergy}-{MCEventType}-{args.bg}",
                    "inputReleaseNumber": release,
                    "mcCampaign": campaign,
                    "prodNumber": prodNumber,
                    "inputDBGlobalTag": DBGT,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "mcType": MCEventType,
                    "mcBackground": args.bg,
                    "inputDataLevel": "mdst",
                    "runNumbers": list(group["runNumber"]),
                }

    else:
        # Run-independent MC (--mcri)
        df.loc[:, "prodNumber"] = (
            df["prodNumber"].str.replace("^prod0*", "", regex=True).astype(int)
        )

        MCTypeGroups = df.groupby(["campaign", "MCEventType"])
        for (campaign, MCEventType), MCTypeGroup in MCTypeGroups:
            groups = MCTypeGroup.groupby(
                ["beamEnergy", "expNumber", "release", "DBGT", "runNumber"]
            )
            for (
                iGroup,
                ((beamEnergy, expNumber, release, DBGT, runNumber), group),
            ) in enumerate(groups):
                DBGT = int(re.sub(r"^DB0*", "", DBGT))

                label = f"{campaign}_{MCEventType}{args.bg}"
                BlockLabel = f"{label}r{iGroup+1}"

                # Build the sample label from the LPN metadata (the location is a placeholder)
                sampleLabel = MCSample(
                    location="DUMMY_PATH",
                    process=MCEventType,
                    campaign=campaign,
                    beam_energy=beamEnergy,
                    beam_background=args.bg,
                ).encodeable_name

                DataBlocks[BlockLabel] = {
                    "sampleLabel": sampleLabel,
                    "inputReleaseNumber": release,
                    "mcCampaign": campaign,
                    "prodNumber": list(group["prodNumber"]),
                    "inputDBGlobalTag": DBGT,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "mcType": MCEventType,
                    "mcBackground": args.bg,
                    "inputDataLevel": "mdst",
                    "runNumbers": runNumber,
                }

    to_yaml(DataBlocks, args.output)


if __name__ == "__main__":
    main()