# Belle II Software  release-06-02-00
# lpns2yaml.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""\
``%(prog)s`` is a tool for converting a list of LPNs into YAML format expected by
:ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
LPNs, like those which can be downloaded from the dataset searcher.

The test sample labels (under the key ``sampleLabel``) are automatically generated, so
please check they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
running the script.

--epilog--
.. rubric:: Example usage

* Convert a list of BGx1 run-independent MC LPNs into YAML format and print to screen::

    $ %(prog)s my_MCri_LPNs.txt --mcri --bg BGx1

* Convert a list of BGx1 run-dependent MC LPNs into YAML format and print to screen::

    $ %(prog)s my_MCrd_LPNs.txt --mcrd --bg BGx1

* Convert a list of data LPNs into YAML format and save to file::

    $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml

"""
37 
38 import argparse
39 import pandas as pd
40 from pathlib import Path
41 import re
42 import sys
43 
44 from skim.utils.testfiles import DataSample, MCSample
45 
46 import yaml
47 
48 
def get_argument_parser():
    """Build the command-line parser for this tool.

    The module docstring is split on the ``--epilog--`` marker: the first half
    becomes the parser description, the second half the epilog.

    Returns:
        argparse.ArgumentParser: Configured argument parser.
    """
    description, epilog = __doc__.split("--epilog--")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument(
        metavar="input_lpn_list_file",
        dest="input",
        help="Input file containing list of LPNs (such as that from the dataset searcher).",
    )
    parser.add_argument(
        "-o",
        metavar="output_filename",
        dest="output",
        help="Output YAML file name. If none given, prints output to screen.",
    )

    # Exactly one of --data/--mcri/--mcrd must be given.
    sample_type_group = parser.add_mutually_exclusive_group(required=True)
    for flag, flag_help in (
        ("--data", "Flag to indicate the LPNs are for data."),
        ("--mcri", "Flag to indicate the LPNs are for run-independent MC."),
        ("--mcrd", "Flag to indicate the LPNs are for run-dependent MC."),
    ):
        sample_type_group.add_argument(flag, action="store_true", help=flag_help)

    # NOTE: requiredness is decided by peeking at sys.argv, since argparse has
    # no native "required if another flag is present" support.
    parser.add_argument(
        "--bg",
        choices=("BGx0", "BGx1"),
        required=any(flag in sys.argv for flag in ("--mcri", "--mcrd")),
        help="Beam background level of MC samples. Only required for MC.",
    )
    return parser
85 
86 
def verify_dataframe(df, mcri):
    """Sanity-check a dataframe of LPN components.

    Parameters:
        df (pandas.DataFrame): Dataframe with one LPN component per column.
        mcri (bool): Whether the LPNs are for run-independent MC.

    Raises:
        ValueError: If the columns do not look like LPN components, if any LPN
            is not an mdst LPN, if the data LPNs carry more than one
            GeneralSkimName, or if run-independent MC LPNs list more than one
            run number.
    """
    # Check that we got the names of the columns right: each component column
    # must carry its characteristic prefix.
    expected_prefixes = (
        ("release", "release-"),
        ("DBGT", "DB"),
        ("expNumber", "e"),
        ("runNumber", "r"),
    )
    if not all(
        df[column].str.startswith(prefix).all()
        for column, prefix in expected_prefixes
    ):
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    # Check assumptions about columns
    if not df["dataLevel"].str.match("mdst").all():
        raise ValueError("Input LPNs must all be mdst.")
    if "generalSkimName" in df.columns and df["generalSkimName"].nunique(dropna=False) > 1:
        raise ValueError("More than one GeneralSkimName in input data LPNs.")
    if mcri and df["runNumber"].nunique(dropna=False) > 1:
        raise ValueError("More than one run number listed for MC LPNs.")
106 
107 
def to_yaml(data, output=None):
    """Print to screen or file as YAML format.

    Parameters:
        data (dict): Data blocks to serialise.
        output (str or pathlib.Path or None): Output file name. The suffix is
            forced to ``.yaml``. If falsy, the YAML is printed to stdout
            instead.
    """
    dumped = yaml.dump(data, sort_keys=False)

    # Add warning about sample labels
    warning = "# TODO: Ensure this label matches a sample label in SkimStats.json"
    dumped = re.sub("(sampleLabel.*)$", f"\\1 {warning}", dumped, flags=re.MULTILINE)

    if isinstance(output, (str, Path)):
        OutputFilename = Path(output).with_suffix(".yaml")
        OutputFilename.write_text(dumped)
        print(f"Wrote YAML file to {OutputFilename}", file=sys.stderr)
    elif not output:
        print(dumped)
123 
124 
def main():
    """Convert the LPN list file given on the command line into YAML blocks.

    Reads the LPNs, strips their common prefix, splits each LPN into path
    components, groups them into production blocks (by campaign, experiment,
    beam energy, release, DBGT and production number), and hands the result
    to ``to_yaml`` for output.
    """
    parser = get_argument_parser()
    args = parser.parse_args()

    # Read in data: one LPN per line; drop blank lines and sort.
    with open(args.input) as f:
        LPNs = sorted(filter(None, f.read().split("\n")))

    # Trim prefix (everything up to the release number). All LPNs are
    # required to share a single prefix.
    prefixes = {re.match(r"^(.*)(/release)", LPN)[1] for LPN in LPNs}
    if len(prefixes) > 1:
        raise ValueError("Somehow got multiple different prefixes!")
    else:
        prefix = list(prefixes)[0]
    LPNs = [re.sub(r"^.*(release)", r"\1", LPN) for LPN in LPNs]

    # ...and put it all into a lovely dataframe, split by LPN part!
    df = pd.DataFrame([Path(LPN).parts for LPN in LPNs])
    if args.data:
        if len(df.columns) == 8:
            # If eight components to LPN, then we're dealing with the old data LPN schema
            columns = dict(
                enumerate(
                    [
                        "release",
                        "DBGT",
                        "campaign",
                        "prodNumber",
                        "expNumber",
                        "beamEnergy",
                        "runNumber",
                        "dataLevel",
                    ]
                )
            )
        elif len(df.columns) == 9:
            # If nine components to LPN, then we're dealing with the new data LPN schema,
            # which includes an additional GeneralSkimName component
            columns = dict(
                enumerate(
                    [
                        "release",
                        "DBGT",
                        "campaign",
                        "prodNumber",
                        "expNumber",
                        "beamEnergy",
                        "runNumber",
                        "generalSkimName",
                        "dataLevel",
                    ]
                )
            )
        # NOTE(review): data LPNs with any other component count leave
        # `columns` unbound and raise NameError below — confirm intended.
    else:
        # MC LPN schema: includes an s00 sub-directory and the MC event type.
        columns = dict(
            enumerate(
                [
                    "release",
                    "DBGT",
                    "campaign",
                    "prodNumber",
                    "s00",
                    "expNumber",
                    "beamEnergy",
                    "runNumber",
                    "MCEventType",
                    "dataLevel",
                ]
            )
        )

    df.rename(columns=columns, inplace=True)
    verify_dataframe(df, args.mcri)

    # For MC, fold the s00 path component back into the experiment number.
    if args.mcri or args.mcrd:
        df.loc[:, "expNumber"] = df["s00"] + "/" + df["expNumber"]
        df.drop("s00", axis=1, inplace=True)

    DataBlocks = {}

    # Group into blocks, to make sure everything in block has the same prodID, release, etc.
    if args.data:
        ExpGroups = df.groupby(["campaign", "expNumber"])
        for (campaign, expNumber), ExpGroup in ExpGroups:
            groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
            for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group)) in enumerate(
                groups
            ):
                # Extract integers from columns (strip prefixes and leading zeros)
                prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
                DBGT = int(re.sub(r"^DB0*", "", DBGT))
                expInteger = int(re.sub(r"^e0*", "", expNumber))

                # If beam energy is not 4S, then point it out in label
                onres = beamEnergy == "4S"
                if onres:
                    label = f"{campaign}_exp{expInteger}r{iGroup+1}"
                else:
                    label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"

                # GeneralSkimName is uniform within the input (checked by
                # verify_dataframe), so the first value stands for the group.
                if "generalSkimName" in df.columns:
                    generalSkim = list(group["generalSkimName"])[0]
                else:
                    generalSkim = "all"

                # Use sample name encoding from DataSample
                sampleLabel = DataSample(
                    location="DUMMY_PATH",
                    processing=campaign,
                    experiment=expInteger,
                    beam_energy=beamEnergy,
                    general_skim=generalSkim,
                ).encodeable_name

                # Add everything to our mega dict
                DataBlocks[label] = {
                    "sampleLabel": sampleLabel,
                    "LPNPrefix": prefix,
                    "inputReleaseNumber": release,
                    "prodNumber": prodNumber,
                    "inputDBGlobalTag": DBGT,
                    "procNumber": campaign,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "inputDataLevel": "mdst",
                    "runNumbers": list(group["runNumber"]),
                }

                if "generalSkimName" in df.columns:
                    DataBlocks[label]["generalSkimName"] = list(
                        group["generalSkimName"]
                    )[0]
    elif args.mcrd:
        # Run-dependent MC: group additionally by MC event type.
        ExpGroups = df.groupby(["campaign", "MCEventType", "expNumber"])
        for (campaign, MCEventType, expNumber), ExpGroup in ExpGroups:
            groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
            for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group)) in enumerate(
                groups
            ):
                # Extract integers from columns (expNumber is "s00/eNNNN" here)
                prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
                DBGT = int(re.sub(r"^DB0*", "", DBGT))
                expInteger = int(re.sub(r"^s00/e0*", "", expNumber))

                # If beam energy is not 4S, then point it out in label
                onres = beamEnergy == "4S"
                if onres:
                    label = f"{campaign}_exp{expInteger}r{iGroup+1}"
                else:
                    label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"

                # Add everything to our mega dict
                DataBlocks[label] = {
                    "sampleLabel": (f"MC-{campaign}-{beamEnergy}-{MCEventType}-{args.bg}"),
                    "LPNPrefix": prefix,
                    "inputReleaseNumber": release,
                    "mcCampaign": campaign,
                    "prodNumber": prodNumber,
                    "inputDBGlobalTag": DBGT,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "mcType": MCEventType,
                    "mcBackground": args.bg,
                    "inputDataLevel": "mdst",
                    "runNumbers": list(group["runNumber"]),
                }
    else:
        # Run-independent MC. Extract integers from columns up front, since
        # prodNumber stays a per-row list here rather than a group key value.
        df.loc[:, "prodNumber"] = (
            df["prodNumber"].str.replace("^prod0*", "", regex=True).astype(int)
        )

        MCTypeGroups = df.groupby(["campaign", "MCEventType"])
        for (campaign, MCEventType), MCTypeGroup in MCTypeGroups:
            groups = MCTypeGroup.groupby(
                ["beamEnergy", "expNumber", "release", "DBGT", "runNumber"]
            )
            for (
                iGroup,
                ((beamEnergy, expNumber, release, DBGT, runNumber), group),
            ) in enumerate(groups):
                # Extract integers from columns
                DBGT = int(re.sub(r"^DB0*", "", DBGT))

                # Label carries campaign, MC event type and background level
                label = f"{campaign}_{MCEventType}{args.bg}"
                BlockLabel = f"{label}r{iGroup+1}"

                # Use sample name encoding from MCSample
                sampleLabel = MCSample(
                    location="DUMMY_PATH",
                    process=MCEventType,
                    campaign=campaign,
                    beam_energy=beamEnergy,
                    beam_background=args.bg,
                ).encodeable_name

                # Add everything to our mega dict
                DataBlocks[BlockLabel] = {
                    "sampleLabel": sampleLabel,
                    "LPNPrefix": prefix,
                    "inputReleaseNumber": release,
                    "mcCampaign": campaign,
                    "prodNumber": list(group["prodNumber"]),
                    "inputDBGlobalTag": DBGT,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "mcType": MCEventType,
                    "mcBackground": args.bg,
                    "inputDataLevel": "mdst",
                    "runNumbers": runNumber,
                }

    to_yaml(DataBlocks, args.output)
339 
340 
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
std::map< ExpRun, std::pair< double, double > > filter(const std::map< ExpRun, std::pair< double, double >> &runs, double cut, std::map< ExpRun, std::pair< double, double >> &runsRemoved)
filter events to remove runs shorter than cut, it stores removed runs in runsRemoved
Definition: Splitter.cc:40
int main(int argc, char **argv)
Run all tests.
Definition: test_main.cc:75