13 ``%(prog)s`` is a tool for converting a list of LPNs into YAML format expected by
14 :ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
15 LPNs, like those which can be downloaded from the dataset searcher.
17 The test sample labels (under the key ``sampleLabel``) are automatically generated, so
18 please check they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
22 .. rubric:: Example usage
24 * Convert list of BGx1 MC LPNs into YAML format and print to screen::
26 $ %(prog)s my_MC_LPNs.txt --mc --bg BGx1
28 * Convert list of data LPNs into YAML format and save to file::
30 $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml
36 from pathlib
import Path
40 from skim.utils.testfiles
import DataSample, MCSample
def get_argument_parser():
    """Build the command-line parser for this tool.

    The module docstring doubles as the help text: the part before the
    ``--epilog--`` marker is used as the description and the part after it as
    the epilog.

    Returns:
        argparse.ArgumentParser: parser exposing ``input`` (positional LPN list
        file), ``output`` (``-o``), the mutually exclusive ``--data`` / ``--mc``
        flags (exactly one is required) and ``--bg``.
    """
    # partition() copes with a missing marker (empty epilog) and with a
    # stripped docstring (``python -OO``), where split() plus two-name
    # unpacking would raise.
    description, _, epilog = (__doc__ or "").partition("--epilog--")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument(
        "input",
        metavar="input_lpn_list_file",
        help="Input file containing list of LPNs (such as that from the dataset searcher).",
    )
    parser.add_argument(
        "-o",
        metavar="output_filename",
        dest="output",
        help="Output YAML file name. If none given, prints output to screen.",
    )

    sample_kind = parser.add_mutually_exclusive_group(required=True)
    sample_kind.add_argument(
        "--data", action="store_true", help="Flag to indicate the LPNs are for data."
    )
    sample_kind.add_argument(
        "--mc", action="store_true", help="Flag to indicate the LPNs are for MC."
    )

    parser.add_argument(
        "--bg",
        choices=("BGx0", "BGx1"),
        # Evaluated once, when the parser is built, against the process
        # command line: --bg only becomes mandatory if "--mc" was typed there.
        required=("--mc" in sys.argv),
        help="Beam background level of MC samples. Only required for MC.",
    )
    return parser
def verify_dataframe(df, mc):
    """Sanity-check the dataframe built from the LPN path components.

    Args:
        df (pandas.DataFrame): one row per LPN. Must have the string columns
            ``release``, ``DBGT``, ``expNumber``, ``runNumber`` and
            ``dataLevel``; ``generalSkimName`` is checked only if present.
        mc (bool): whether the LPNs are MC; enables the single-run-number check.

    Raises:
        ValueError: if a column does not carry the expected prefix, if any LPN
            is not mdst, if more than one general skim name is present, or if
            MC LPNs list more than one run number.
    """
    expected_prefixes = {
        "release": "release-",
        "DBGT": "DB",
        "expNumber": "e",
        "runNumber": "r",
    }
    # na=False: a missing value would otherwise yield NaN, which is truthy and
    # would let a misaligned or too-short LPN slip through the check.
    columns_line_up = all(
        df[column].str.startswith(prefix, na=False).all()
        for column, prefix in expected_prefixes.items()
    )
    if not columns_line_up:
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    # Assumptions the rest of the script relies on.
    if not df["dataLevel"].str.match("mdst", na=False).all():
        raise ValueError("Input LPNs must all be mdst.")
    if "generalSkimName" in df.columns and len(set(df["generalSkimName"])) > 1:
        raise ValueError("More than one GeneralSkimName in input data LPNs.")
    if mc and len(set(df["runNumber"])) > 1:
        raise ValueError("More than one run number listed for MC LPNs.")
def to_yaml(data, output=None):
    """Write ``data`` as YAML to a file, or print it to the screen.

    Every line of the dump that contains ``sampleLabel`` gets a trailing TODO
    comment reminding the user to check the automatically generated label.

    Args:
        data: object passed to ``yaml.dump`` (key order is preserved).
        output (str or pathlib.Path, optional): output file name; its suffix is
            replaced by ``.yaml``. For any other value (e.g. ``None``) the YAML
            is printed to stdout instead.
    """
    string = yaml.dump(data, sort_keys=False)

    # Append a reminder to each sample label line.
    # NOTE(review): the module docstring points users at
    # skim/scripts/TestFiles.yaml while this message names SkimStats.json --
    # confirm which file is the right one before changing the text.
    warning = "# TODO: Ensure this label matches a sample label in SkimStats.json"
    string = re.sub(r"(sampleLabel.*)$", rf"\1 {warning}", string, flags=re.MULTILINE)

    if isinstance(output, (str, Path)):
        output_path = Path(output).with_suffix(".yaml")
        # Explicit encoding so the file does not depend on the locale default.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(string)
        # Status message goes to stderr to keep stdout free for YAML output.
        print(f"Wrote YAML file to {output_path}", file=sys.stderr)
    else:
        print(string)
119 parser = get_argument_parser()
120 args = parser.parse_args()
123 with open(args.input)
as f:
124 LPNs = sorted(
filter(
None, f.read().split(
"\n")))
127 prefixes = {re.match(
r"^(.*)(/release)", LPN)[1]
for LPN
in LPNs}
128 if len(prefixes) > 1:
129 raise ValueError(
"Somehow got multiple different prefixes!")
131 prefix = list(prefixes)[0]
132 LPNs = [re.sub(
r"^.*(release)",
r"\1", LPN)
for LPN
in LPNs]
135 df = pd.DataFrame([Path(LPN).parts
for LPN
in LPNs])
137 if len(df.columns) == 8:
153 elif len(df.columns) == 9:
189 df.rename(columns=columns, inplace=
True)
190 verify_dataframe(df, args.mc)
193 df.loc[:,
"expNumber"] = df[
"s00"] +
"/" + df[
"expNumber"]
194 df.drop(
"s00", axis=1, inplace=
True)
200 ExpGroups = df.groupby([
"campaign",
"expNumber"])
201 for (campaign, expNumber), ExpGroup
in ExpGroups:
202 groups = ExpGroup.groupby([
"beamEnergy",
"release",
"DBGT",
"prodNumber"])
203 for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group))
in enumerate(
207 prodNumber = int(re.sub(
r"^prod0*",
"", prodNumber))
208 DBGT = int(re.sub(
r"^DB0*",
"", DBGT))
209 expInteger = int(re.sub(
r"^e0*",
"", expNumber))
212 onres = beamEnergy ==
"4S"
214 label = f
"{campaign}_exp{expInteger}r{iGroup+1}"
216 label = f
"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"
218 if "generalSkimName" in df.columns:
219 generalSkim = list(group[
"generalSkimName"])[0]
224 sampleLabel = DataSample(
225 location=
"DUMMY_PATH",
227 experiment=expInteger,
228 beam_energy=beamEnergy,
229 general_skim=generalSkim,
233 DataBlocks[label] = {
234 "sampleLabel": sampleLabel,
236 "inputReleaseNumber": release,
237 "prodNumber": prodNumber,
238 "inputDBGlobalTag": DBGT,
239 "procNumber": campaign,
240 "experimentNumber": expNumber,
241 "beamEnergy": beamEnergy,
242 "inputDataLevel":
"mdst",
243 "runNumbers": list(group[
"runNumber"]),
246 if "generalSkimName" in df.columns:
247 DataBlocks[label][
"generalSkimName"] = list(
248 group[
"generalSkimName"]
252 df.loc[:,
"prodNumber"] = (
253 df[
"prodNumber"].str.replace(
"^prod0*",
"").astype(int)
256 MCTypeGroups = df.groupby([
"campaign",
"MCEventType"])
257 for (campaign, MCEventType), MCTypeGroup
in MCTypeGroups:
258 groups = MCTypeGroup.groupby(
259 [
"beamEnergy",
"expNumber",
"release",
"DBGT",
"runNumber"]
263 ((beamEnergy, expNumber, release, DBGT, runNumber), group),
264 )
in enumerate(groups):
266 DBGT = int(re.sub(
r"^DB0*",
"", DBGT))
269 label = f
"{campaign}_{MCEventType}{args.bg}"
270 BlockLabel = f
"{label}r{iGroup+1}"
273 sampleLabel = MCSample(
274 location=
"DUMMY_PATH",
277 beam_energy=beamEnergy,
278 beam_background=args.bg,
282 DataBlocks[BlockLabel] = {
283 "sampleLabel": sampleLabel,
285 "inputReleaseNumber": release,
286 "mcCampaign": campaign,
287 "prodNumber": list(group[
"prodNumber"]),
288 "inputDBGlobalTag": DBGT,
289 "experimentNumber": expNumber,
290 "beamEnergy": beamEnergy,
291 "mcType": MCEventType,
292 "mcBackground": args.bg,
293 "inputDataLevel":
"mdst",
294 "runNumber": runNumber,
297 to_yaml(DataBlocks, args.output)
300 if __name__ ==
"__main__":
std::map< ExpRun, std::pair< double, double > > filter(const std::map< ExpRun, std::pair< double, double >> &runs, double cut, std::map< ExpRun, std::pair< double, double >> &runsRemoved)
filter events to remove runs shorter than cut, it stores removed runs in runsRemoved
int main(int argc, char **argv)
Run all tests.