# Belle II Software  release-06-00-14
# lpns2yaml.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 """\
13 ``%(prog)s`` is a tool for converting a list of LPNs into YAML format expected by
14 :ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
15 LPNs, like those which can be downloaded from the dataset searcher.
16 
17 The test sample labels (under the key ``sampleLabel``) are automatically generated, so
18 please check they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
19 running the script.
20 
21 --epilog--
22 .. rubric:: Example usage
23 
24 * Convert list of BGx1 MC LPNs into YAML format and print to screen::
25 
26  $ %(prog)s my_MC_LPNs.txt --mc --bg BGx1
27 
28 * Convert list of data LPNs into YAML format and save to file::
29 
30  $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml
31 
32 """
33 
34 import argparse
35 import pandas as pd
36 from pathlib import Path
37 import re
38 import sys
39 
40 from skim.utils.testfiles import DataSample, MCSample
41 
42 import yaml
43 
44 
45 def get_argument_parser():
46  description, epilog = __doc__.split("--epilog--")
47  parser = argparse.ArgumentParser(
48  description=description,
49  formatter_class=argparse.RawDescriptionHelpFormatter,
50  epilog=epilog,
51  )
52  parser.add_argument(
53  metavar="input_lpn_list_file",
54  dest="input",
55  help="Input file containing list of LPNs (such as that from the dataset searcher).",
56  )
57  parser.add_argument(
58  "-o",
59  metavar="output_filename",
60  dest="output",
61  help="Output YAML file name. If none given, prints output to screen.",
62  )
63  DataMCGroup = parser.add_mutually_exclusive_group(required=True)
64  DataMCGroup.add_argument(
65  "--data", action="store_true", help="Flag to indicate the LPNs are for data."
66  )
67  DataMCGroup.add_argument(
68  "--mc", action="store_true", help="Flag to indicate the LPNs are for MC."
69  )
70 
71  parser.add_argument(
72  "--bg",
73  choices=("BGx0", "BGx1"),
74  required=("--mc" in sys.argv),
75  help="Beam background level of MC samples. Only required for MC.",
76  )
77  return parser
78 
79 
def verify_dataframe(df, mc):
    """Run sanity checks on the parsed LPN dataframe.

    Parameters:
        df: Dataframe with one LPN component per column.
        mc (bool): True if the LPNs are MC LPNs.

    Raises:
        ValueError: If a column does not look like the LPN component it is
            supposed to hold, or if an assumption about the samples fails.
    """
    # Each of these columns must start with a fixed prefix; if one doesn't,
    # the column names were assigned to the wrong LPN components.
    prefix_checks = [
        ("release", "release-"),
        ("DBGT", "DB"),
        ("expNumber", "e"),
        ("runNumber", "r"),
    ]
    columns_ok = all(
        df[column].str.startswith(wanted).all() for column, wanted in prefix_checks
    )
    if not columns_ok:
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    # Check assumptions about columns
    if not df["dataLevel"].str.match("mdst").all():
        raise ValueError("Input LPNs must all be mdst.")
    if "generalSkimName" in df.columns and len(set(df["generalSkimName"])) > 1:
        raise ValueError("More than one GeneralSkimName in input data LPNs.")
    if mc and len(set(df["runNumber"])) > 1:
        raise ValueError("More than one run number listed for MC LPNs.")
99 
100 
def to_yaml(data, output=None):
    """Print to screen or file as YAML format.

    Parameters:
        data: Object to serialise as YAML.
        output (str, Path): Output file name (suffix forced to ``.yaml``).
            If falsy, the YAML string is printed to stdout instead.
    """
    dumped = yaml.dump(data, sort_keys=False)

    # Flag every sampleLabel line for manual checking
    warning = "# TODO: Ensure this label matches a sample label in SkimStats.json"
    dumped = re.sub("(sampleLabel.*)$", f"\\1 {warning}", dumped, flags=re.MULTILINE)

    if isinstance(output, (str, Path)):
        OutputFilename = Path(output).with_suffix(".yaml")
        OutputFilename.write_text(dumped)
        print(f"Wrote YAML file to {OutputFilename}", file=sys.stderr)
    elif not output:
        print(dumped)
116 
117 
def main():
    """Convert a text file of LPNs into the YAML format expected by
    ``b2skim-prod``.

    Reads the input file given on the command line, splits each LPN into its
    components, groups the LPNs into blocks with homogeneous production
    settings (release, DBGT, prod number, ...), and writes the result as YAML
    via :func:`to_yaml`.
    """
    parser = get_argument_parser()
    args = parser.parse_args()

    # Read in data, discarding blank lines
    with open(args.input) as f:
        LPNs = sorted(filter(None, f.read().split("\n")))
    if not LPNs:
        raise ValueError(f"No LPNs found in input file {args.input}")

    # Trim prefix (everything up to the release number). All LPNs must share
    # one prefix, since it is written out only once per block.
    prefixes = {re.match(r"^(.*)(/release)", LPN)[1] for LPN in LPNs}
    if len(prefixes) > 1:
        raise ValueError("Somehow got multiple different prefixes!")
    prefix = list(prefixes)[0]
    LPNs = [re.sub(r"^.*(release)", r"\1", LPN) for LPN in LPNs]

    # ...and put it all into a dataframe, with one LPN component per column
    df = pd.DataFrame([Path(LPN).parts for LPN in LPNs])
    if args.data:
        if len(df.columns) == 8:
            # Eight components to the LPN: old data LPN schema
            columns = dict(
                enumerate(
                    [
                        "release",
                        "DBGT",
                        "campaign",
                        "prodNumber",
                        "expNumber",
                        "beamEnergy",
                        "runNumber",
                        "dataLevel",
                    ]
                )
            )
        elif len(df.columns) == 9:
            # Nine components to the LPN: data LPN schema with the
            # additional GeneralSkimName component
            columns = dict(
                enumerate(
                    [
                        "release",
                        "DBGT",
                        "campaign",
                        "prodNumber",
                        "expNumber",
                        "beamEnergy",
                        "runNumber",
                        "generalSkimName",
                        "dataLevel",
                    ]
                )
            )
        else:
            # Previously this fell through to a NameError on `columns`
            raise ValueError(
                f"Got {len(df.columns)} LPN components for data; expected 8 or 9."
            )
    else:
        # MC LPN schema (includes the sXX subdirectory and the MC event type)
        columns = dict(
            enumerate(
                [
                    "release",
                    "DBGT",
                    "campaign",
                    "prodNumber",
                    "s00",
                    "expNumber",
                    "beamEnergy",
                    "runNumber",
                    "MCEventType",
                    "dataLevel",
                ]
            )
        )

    df.rename(columns=columns, inplace=True)
    verify_dataframe(df, args.mc)

    if args.mc:
        # Fold the sXX subdirectory into the experiment number component
        df.loc[:, "expNumber"] = df["s00"] + "/" + df["expNumber"]
        df.drop("s00", axis=1, inplace=True)

    DataBlocks = {}

    # Group into blocks, to make sure everything in a block has the same
    # prodID, release, etc.
    if args.data:
        ExpGroups = df.groupby(["campaign", "expNumber"])
        for (campaign, expNumber), ExpGroup in ExpGroups:
            groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
            for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group)) in enumerate(
                groups
            ):
                # Extract integers from the zero-padded string columns
                prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
                DBGT = int(re.sub(r"^DB0*", "", DBGT))
                expInteger = int(re.sub(r"^e0*", "", expNumber))

                # If beam energy is not 4S, then point it out in the label
                onres = beamEnergy == "4S"
                if onres:
                    label = f"{campaign}_exp{expInteger}r{iGroup+1}"
                else:
                    label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"

                if "generalSkimName" in df.columns:
                    generalSkim = list(group["generalSkimName"])[0]
                else:
                    generalSkim = "all"

                # Use sample name encoding from DataSample
                sampleLabel = DataSample(
                    location="DUMMY_PATH",
                    processing=campaign,
                    experiment=expInteger,
                    beam_energy=beamEnergy,
                    general_skim=generalSkim,
                ).encodeable_name

                # Add everything to our mega dict
                DataBlocks[label] = {
                    "sampleLabel": sampleLabel,
                    "LPNPrefix": prefix,
                    "inputReleaseNumber": release,
                    "prodNumber": prodNumber,
                    "inputDBGlobalTag": DBGT,
                    "procNumber": campaign,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "inputDataLevel": "mdst",
                    "runNumbers": list(group["runNumber"]),
                }

                if "generalSkimName" in df.columns:
                    DataBlocks[label]["generalSkimName"] = list(
                        group["generalSkimName"]
                    )[0]
    else:
        # Extract integers from the zero-padded prod number column.
        # regex=True is required: since pandas 2.0 the default is a literal
        # (non-regex) replacement, so "^prod0*" would never be stripped and
        # astype(int) would fail.
        df.loc[:, "prodNumber"] = (
            df["prodNumber"].str.replace("^prod0*", "", regex=True).astype(int)
        )

        MCTypeGroups = df.groupby(["campaign", "MCEventType"])
        for (campaign, MCEventType), MCTypeGroup in MCTypeGroups:
            groups = MCTypeGroup.groupby(
                ["beamEnergy", "expNumber", "release", "DBGT", "runNumber"]
            )
            for (
                iGroup,
                ((beamEnergy, expNumber, release, DBGT, runNumber), group),
            ) in enumerate(groups):
                # Extract integer from the zero-padded DBGT column
                DBGT = int(re.sub(r"^DB0*", "", DBGT))

                # MC blocks are labelled by campaign, event type, and BG level
                label = f"{campaign}_{MCEventType}{args.bg}"
                BlockLabel = f"{label}r{iGroup+1}"

                # Use sample name encoding from MCSample
                sampleLabel = MCSample(
                    location="DUMMY_PATH",
                    process=MCEventType,
                    campaign=campaign,
                    beam_energy=beamEnergy,
                    beam_background=args.bg,
                ).encodeable_name

                # Add everything to our mega dict
                DataBlocks[BlockLabel] = {
                    "sampleLabel": sampleLabel,
                    "LPNPrefix": prefix,
                    "inputReleaseNumber": release,
                    "mcCampaign": campaign,
                    "prodNumber": list(group["prodNumber"]),
                    "inputDBGlobalTag": DBGT,
                    "experimentNumber": expNumber,
                    "beamEnergy": beamEnergy,
                    "mcType": MCEventType,
                    "mcBackground": args.bg,
                    "inputDataLevel": "mdst",
                    "runNumber": runNumber,
                }

    to_yaml(DataBlocks, args.output)
298 
299 
# Allow use both as a standalone script and as an importable module.
if __name__ == "__main__":
    main()
# NOTE: The lines below are unrelated C++ documentation-extraction residue
# (from Splitter.cc / test_main.cc) accidentally appended to this file; kept
# as comments so the file remains valid Python.
# std::map< ExpRun, std::pair< double, double > > filter(const std::map< ExpRun, std::pair< double, double >> &runs, double cut, std::map< ExpRun, std::pair< double, double >> &runsRemoved)
# filter events to remove runs shorter than cut, it stores removed runs in runsRemoved
# Definition: Splitter.cc:40
# int main(int argc, char **argv)
# Run all tests.
# Definition: test_main.cc:75