5 ``%(prog)s`` is a tool for converting a list of LPNs into YAML format expected by
6 :ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
7 LPNs, like those which can be downloaded from the dataset searcher.
9 The test sample labels (under the key ``sampleLabel``) are automatically generated, so
10 please check they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
14 .. rubric:: Example usage
16 * Convert list of BGx1 MC LPNs into YAML format and print to screen::
18 $ %(prog)s my_MC_LPNs.txt --mc --bg BGx1
20 * Convert list of data LPNs into YAML format and save to file::
22 $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml
28 from pathlib
import Path
32 from termcolor
import colored
36 __author__ =
"Phil Grace"
class _LPNArgumentParser(argparse.ArgumentParser):
    """Argument parser that insists on ``--bg`` whenever ``--mc`` is selected."""

    def parse_args(self, args=None, namespace=None):
        """Parse as usual, then reject ``--mc`` given without ``--bg``.

        The ``required=`` flag on ``--bg`` is only a best-effort hint computed
        from ``sys.argv``; this post-parse check also covers abbreviated flags
        (e.g. ``--m``) and explicit ``args`` lists.
        """
        parsed = super().parse_args(args, namespace)
        if parsed.mc and parsed.bg is None:
            # Same wording argparse itself uses for a missing required option.
            self.error("the following arguments are required: --bg")
        return parsed


def get_argument_parser():
    """Build the command-line parser for this tool.

    The module docstring supplies the help text: everything before the
    ``--epilog--`` marker becomes the description and everything after it the
    epilog.

    Returns:
        argparse.ArgumentParser: parser exposing ``input`` (positional),
        ``output`` (``-o``), the mutually exclusive ``--data`` / ``--mc`` flags
        and ``--bg``.
    """
    # ``__doc__`` is None when docstrings are stripped (``python -OO``), and
    # ``partition`` tolerates a missing marker, so building the parser never
    # fails just because the help text is unavailable.
    description, _, epilog = (__doc__ or "").partition("--epilog--")
    parser = _LPNArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument(
        metavar="input_lpn_list_file",
        dest="input",
        help="Input file containing list of LPNs (such as that from the dataset searcher).",
    )
    parser.add_argument(
        "-o",
        metavar="output_filename",
        dest="output",
        help="Output YAML file name. If none given, prints output to screen.",
    )

    # Exactly one of --data / --mc must be given.
    sample_kind = parser.add_mutually_exclusive_group(required=True)
    sample_kind.add_argument(
        "--data", action="store_true", help="Flag to indicate the LPNs are for data."
    )
    sample_kind.add_argument(
        "--mc", action="store_true", help="Flag to indicate the LPNs are for MC."
    )

    parser.add_argument(
        "--bg",
        choices=("BGx0", "BGx1"),
        # Kept so the usage line marks --bg as mandatory for MC invocations;
        # the authoritative check lives in _LPNArgumentParser.parse_args.
        required=("--mc" in sys.argv),
        help="Beam background level of MC samples. Only required for MC.",
    )
    return parser
def verify_dataframe(df, mc):
    """Sanity-check the table of LPN path components.

    Args:
        df (pandas.DataFrame): one row per LPN with string columns
            ``release``, ``DBGT``, ``expNumber``, ``runNumber``, ``dataLevel``
            and ``beamEnergy``.
        mc (bool): whether the LPNs are for MC; if so, all rows must share a
            single run number.

    Raises:
        ValueError: if a column does not carry its expected prefix (which
            includes missing values from LPNs with too few path components),
            if any LPN is not mdst, or if MC LPNs list more than one run number.
    """
    # Check that we got the names of the columns right. ``na=False`` makes a
    # missing value count as a mismatch; left as NaN it would be truthy and a
    # malformed LPN would slip through the check.
    expected_prefixes = {
        "release": "release-",
        "DBGT": "DB",
        "expNumber": "e",
        "runNumber": "r",
    }
    columns_line_up = all(
        df[column].str.startswith(prefix, na=False).all()
        for column, prefix in expected_prefixes.items()
    )
    if not columns_line_up:
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    # Check assumptions about columns
    if not df["dataLevel"].str.match("mdst", na=False).all():
        raise ValueError("Input LPNs must all be mdst.")
    if mc and df["runNumber"].nunique() > 1:
        raise ValueError("More than one run number listed for MC LPNs.")
    if df["beamEnergy"].nunique(dropna=False) > 1:
        # Only a warning, and on stderr so it never mixes with YAML on stdout.
        print(
            colored(
                "[WARNING] Input LPNs contain more than one beam energy.", "yellow"
            ),
            file=sys.stderr,
        )
def to_yaml(data, output=None):
    """Dump ``data`` as YAML, either into a file or onto standard output.

    Args:
        data: object handed to ``yaml.dump``; key order is preserved.
        output (str or pathlib.Path, optional): destination file. Its suffix
            is replaced by ``.yaml``. Any other value (including ``None``)
            sends the YAML to stdout instead.
    """
    # No usable file name: print to screen and finish.
    if not isinstance(output, (str, Path)):
        yaml.dump(data, sys.stdout, sort_keys=False)
        return

    yaml_path = Path(output).with_suffix(".yaml")
    with open(yaml_path, "w") as stream:
        yaml.dump(data, stream, sort_keys=False)
    # Status message goes to stderr, separate from any YAML output.
    print(f"[INFO] Wrote YAML file to {yaml_path}", file=sys.stderr)
113 parser = get_argument_parser()
114 args = parser.parse_args()
117 with open(args.input)
as f:
118 LPNs = sorted(
filter(
None, f.read().split(
"\n")))
121 prefixes = {re.match(
r"^(.*)(/release)", LPN)[1]
for LPN
in LPNs}
122 if len(prefixes) > 1:
123 raise ValueError(
"Somehow got multiple different prefixes!")
125 prefix = list(prefixes)[0]
126 LPNs = [re.sub(
r"^.*(release)",
r"\1", LPN)
for LPN
in LPNs]
129 df = pd.DataFrame([Path(LPN).parts
for LPN
in LPNs])
163 df.rename(columns=columns, inplace=
True)
164 verify_dataframe(df, args.mc)
167 df.loc[:,
"expNumber"] = df[
"s00"] +
"/" + df[
"expNumber"]
168 df.drop(
"s00", axis=1, inplace=
True)
174 ExpGroups = df.groupby([
"campaign",
"expNumber"])
175 for (campaign, expNumber), ExpGroup
in ExpGroups:
176 groups = ExpGroup.groupby([
"beamEnergy",
"release",
"DBGT",
"prodNumber"])
177 for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group))
in enumerate(
181 prodNumber = int(re.sub(
r"^prod0*",
"", prodNumber))
182 DBGT = int(re.sub(
r"^DB0*",
"", DBGT))
183 expInteger = int(re.sub(
r"^e0*",
"", expNumber))
186 onres = beamEnergy ==
"4S"
188 label = f
"{campaign}_exp{expInteger}r{iGroup+1}"
190 label = f
"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"
193 DataBlocks[label] = {
194 "sampleLabel": (f
"{campaign}_exp{expInteger}" if onres
else "???"),
196 "inputReleaseNumber": release,
197 "prodNumber": prodNumber,
198 "inputDBGlobalTag": DBGT,
199 "procNumber": campaign,
200 "experimentNumber": expNumber,
201 "beamEnergy": beamEnergy,
202 "inputDataLevel":
"mdst",
203 "runNumbers": list(group[
"runNumber"]),
207 df.loc[:,
"prodNumber"] = (
208 df[
"prodNumber"].str.replace(
"^prod0*",
"").astype(int)
211 MCTypeGroups = df.groupby([
"campaign",
"MCEventType"])
212 for (campaign, MCEventType), MCTypeGroup
in MCTypeGroups:
213 groups = MCTypeGroup.groupby(
214 [
"beamEnergy",
"expNumber",
"release",
"DBGT",
"runNumber"]
218 ((beamEnergy, expNumber, release, DBGT, runNumber), group),
219 )
in enumerate(groups):
221 DBGT = int(re.sub(
r"^DB0*",
"", DBGT))
224 label = f
"{campaign}_{MCEventType}{args.bg}"
225 BlockLabel = f
"{label}r{iGroup+1}"
228 DataBlocks[BlockLabel] = {
229 "sampleLabel": label,
231 "inputReleaseNumber": release,
232 "mcCampaign": campaign,
233 "prodNumber": list(group[
"prodNumber"]),
234 "inputDBGlobalTag": DBGT,
235 "experimentNumber": expNumber,
236 "beamEnergy": beamEnergy,
237 "mcType": MCEventType,
238 "mcBackground": args.bg,
239 "inputDataLevel":
"mdst",
240 "runNumber": runNumber,
243 to_yaml(DataBlocks, args.output)
246 if __name__ ==
"__main__":
251 "[WARNING] Please check that the 'sampleLabel' entries in the output "
252 "YAML file match sample labels in skim/scripts/TestFiles.yaml."