# Belle II Software release-05-01-25
# lpns2yaml.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 """\
5 ``%(prog)s`` is a tool for converting a list of LPNs into YAML format expected by
6 :ref:`b2skim-prod<b2skim-prod>`. The expected input to ``%(prog)s`` is a text file of
7 LPNs, like those which can be downloaded from the dataset searcher.
8 
9 The test sample labels (under the key ``sampleLabel``) are automatically generated, so
please check they all correspond to a label in ``skim/scripts/TestFiles.yaml`` after
11 running the script.
12 
13 --epilog--
14 .. rubric:: Example usage
15 
16 * Convert list of BGx1 MC LPNs into YAML format and print to screen::
17 
18  $ %(prog)s my_MC_LPNs.txt --mc --bg BGx1
19 
20 * Convert list of data LPNs into YAML format and save to file::
21 
22  $ %(prog)s my_data_LPNs.txt --data -o my_data_LPNs.yaml
23 
24 """
25 
26 import argparse
27 import pandas as pd
28 from pathlib import Path
29 import re
30 import sys
31 
32 from termcolor import colored
33 import yaml
34 
35 
36 __author__ = "Phil Grace"
37 
38 
def get_argument_parser():
    """Build the command-line parser for this script.

    The module docstring is split on the ``--epilog--`` marker: the text
    before it becomes the parser description, the text after it the epilog
    (shown below the option list in ``--help`` output).

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    description, epilog = __doc__.split("--epilog--")
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )

    # Positional argument: the text file of LPNs to convert.
    parser.add_argument(
        metavar="input_lpn_list_file",
        dest="input",
        help="Input file containing list of LPNs (such as that from the dataset searcher).",
    )
    parser.add_argument(
        "-o",
        metavar="output_filename",
        dest="output",
        help="Output YAML file name. If none given, prints output to screen.",
    )

    # Exactly one of --data / --mc must be supplied.
    data_mc_group = parser.add_mutually_exclusive_group(required=True)
    data_mc_group.add_argument(
        "--data", action="store_true", help="Flag to indicate the LPNs are for data."
    )
    data_mc_group.add_argument(
        "--mc", action="store_true", help="Flag to indicate the LPNs are for MC."
    )

    # --bg only makes sense for MC, so it is required exactly when --mc
    # appears on the command line (argparse cannot express this natively).
    parser.add_argument(
        "--bg",
        choices=("BGx0", "BGx1"),
        required=("--mc" in sys.argv),
        help="Beam background level of MC samples. Only required for MC.",
    )
    return parser
72 
73 
def verify_dataframe(df, mc):
    """Sanity-check the dataframe of LPN parts.

    Parameters:
        df (pandas.DataFrame): dataframe with one LPN part per column.
        mc (bool): True if the LPNs are for MC samples.

    Raises:
        ValueError: if the columns do not look like the expected LPN parts,
            if any LPN is not mdst, or if MC LPNs span several run numbers.
    """
    # Columns were assigned by position, so verify the values carry the
    # prefixes each LPN part is expected to have.
    columns_look_right = (
        df["release"].str.startswith("release-").all()
        and df["DBGT"].str.startswith("DB").all()
        and df["expNumber"].str.startswith("e").all()
        and df["runNumber"].str.startswith("r").all()
    )
    if not columns_look_right:
        raise ValueError(
            "The column values don't seem to line up with what's expected."
        )

    # Check assumptions about the column contents.
    if not df["dataLevel"].str.match("mdst").all():
        raise ValueError("Input LPNs must all be mdst.")
    if mc and len(set(df["runNumber"])) > 1:
        raise ValueError("More than one run number listed for MC LPNs.")
    if len(set(df["beamEnergy"])) > 1:
        # Mixed beam energies are suspicious but not fatal: warn only.
        print(
            colored(
                "[WARNING] Input LPNs contain more than one beam energy.", "yellow"
            ),
            file=sys.stderr,
        )
99 
100 
def to_yaml(data, output=None):
    """Serialise ``data`` as YAML to a file or to stdout.

    Parameters:
        data (dict): the data blocks to serialise.
        output (str, Path, optional): destination file name; its suffix is
            forced to ``.yaml``. If falsy (e.g. ``None``), dump to stdout
            instead. Any other truthy value is silently ignored.
    """
    if isinstance(output, (str, Path)):
        target = Path(output).with_suffix(".yaml")
        # sort_keys=False preserves the insertion order of the blocks.
        with open(target, "w") as stream:
            yaml.dump(data, stream, sort_keys=False)
        print(f"[INFO] Wrote YAML file to {target}", file=sys.stderr)
    elif not output:
        yaml.dump(data, sys.stdout, sort_keys=False)
110 
111 
112 def main():
113  parser = get_argument_parser()
114  args = parser.parse_args()
115 
116  # Read in data
117  with open(args.input) as f:
118  LPNs = sorted(filter(None, f.read().split("\n")))
119 
120  # Trim prefix (everything up to the release number)
121  prefixes = {re.match(r"^(.*)(/release)", LPN)[1] for LPN in LPNs}
122  if len(prefixes) > 1:
123  raise ValueError("Somehow got multiple different prefixes!")
124  else:
125  prefix = list(prefixes)[0]
126  LPNs = [re.sub(r"^.*(release)", r"\1", LPN) for LPN in LPNs]
127 
128  # ...and put it all into a lovely dataframe, split by LPN part!
129  df = pd.DataFrame([Path(LPN).parts for LPN in LPNs])
130  if args.data:
131  columns = dict(
132  enumerate(
133  [
134  "release",
135  "DBGT",
136  "campaign",
137  "prodNumber",
138  "expNumber",
139  "beamEnergy",
140  "runNumber",
141  "dataLevel",
142  ]
143  )
144  )
145  else:
146  columns = dict(
147  enumerate(
148  [
149  "release",
150  "DBGT",
151  "campaign",
152  "prodNumber",
153  "s00",
154  "expNumber",
155  "beamEnergy",
156  "runNumber",
157  "MCEventType",
158  "dataLevel",
159  ]
160  )
161  )
162 
163  df.rename(columns=columns, inplace=True)
164  verify_dataframe(df, args.mc)
165 
166  if args.mc:
167  df.loc[:, "expNumber"] = df["s00"] + "/" + df["expNumber"]
168  df.drop("s00", axis=1, inplace=True)
169 
170  DataBlocks = {}
171 
172  # Group into blocks, to make sure everything in block has the same prodID, release, etc.
173  if args.data:
174  ExpGroups = df.groupby(["campaign", "expNumber"])
175  for (campaign, expNumber), ExpGroup in ExpGroups:
176  groups = ExpGroup.groupby(["beamEnergy", "release", "DBGT", "prodNumber"])
177  for (iGroup, ((beamEnergy, release, DBGT, prodNumber), group)) in enumerate(
178  groups
179  ):
180  # Extract integers from columns
181  prodNumber = int(re.sub(r"^prod0*", "", prodNumber))
182  DBGT = int(re.sub(r"^DB0*", "", DBGT))
183  expInteger = int(re.sub(r"^e0*", "", expNumber))
184 
185  # If beam energy is not 4S, then point it out in label
186  onres = beamEnergy == "4S"
187  if onres:
188  label = f"{campaign}_exp{expInteger}r{iGroup+1}"
189  else:
190  label = f"{campaign}_{beamEnergy}_exp{expInteger}r{iGroup+1}"
191 
192  # Add everything to our mega dict
193  DataBlocks[label] = {
194  "sampleLabel": (f"{campaign}_exp{expInteger}" if onres else "???"),
195  "LPNPrefix": prefix,
196  "inputReleaseNumber": release,
197  "prodNumber": prodNumber,
198  "inputDBGlobalTag": DBGT,
199  "procNumber": campaign,
200  "experimentNumber": expNumber,
201  "beamEnergy": beamEnergy,
202  "inputDataLevel": "mdst",
203  "runNumbers": list(group["runNumber"]),
204  }
205  else:
206  # Extract integers from columns
207  df.loc[:, "prodNumber"] = (
208  df["prodNumber"].str.replace("^prod0*", "").astype(int)
209  )
210 
211  MCTypeGroups = df.groupby(["campaign", "MCEventType"])
212  for (campaign, MCEventType), MCTypeGroup in MCTypeGroups:
213  groups = MCTypeGroup.groupby(
214  ["beamEnergy", "expNumber", "release", "DBGT", "runNumber"]
215  )
216  for (
217  iGroup,
218  ((beamEnergy, expNumber, release, DBGT, runNumber), group),
219  ) in enumerate(groups):
220  # Extract integers from columns
221  DBGT = int(re.sub(r"^DB0*", "", DBGT))
222 
223  # If beam energy is not 4S, then point it out in label
224  label = f"{campaign}_{MCEventType}{args.bg}"
225  BlockLabel = f"{label}r{iGroup+1}"
226 
227  # Add everything to our mega dict
228  DataBlocks[BlockLabel] = {
229  "sampleLabel": label,
230  "LPNPrefix": prefix,
231  "inputReleaseNumber": release,
232  "mcCampaign": campaign,
233  "prodNumber": list(group["prodNumber"]),
234  "inputDBGlobalTag": DBGT,
235  "experimentNumber": expNumber,
236  "beamEnergy": beamEnergy,
237  "mcType": MCEventType,
238  "mcBackground": args.bg,
239  "inputDataLevel": "mdst",
240  "runNumber": runNumber,
241  }
242 
243  to_yaml(DataBlocks, args.output)
244 
245 
if __name__ == "__main__":
    main()
    # Remind the user to cross-check the auto-generated sample labels
    # against the canonical list (see module docstring).
    warning_text = (
        "[WARNING] Please check that the 'sampleLabel' entries in the output "
        "YAML file match sample labels in skim/scripts/TestFiles.yaml."
    )
    print(colored(warning_text, "yellow"), file=sys.stderr)
# NOTE(review): The following lines are cross-reference residue from
# automated documentation extraction (C++ symbols from Splitter.cc and
# test_main.cc); they are not part of this Python script.
# Belle2::filter
# std::map< ExpRun, std::pair< double, double > > filter(const std::map< ExpRun, std::pair< double, double >> &runs, double cut, std::map< ExpRun, std::pair< double, double >> &runsRemoved)
# filter events to remove runs shorter than cut, it stores removed runs in runsRemoved
# Definition: Splitter.cc:43
# main
# int main(int argc, char **argv)
# Run all tests.
# Definition: test_main.cc:77