Belle II Software  release-06-01-15
testfiles.py
1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 
4 
11 
12 
13 import re
14 import sys
15 from pathlib import Path
16 
17 import json
18 import jsonschema
19 import yaml
20 
21 from basf2 import find_file
22 
23 
class Sample:
    """Base class for skim test samples."""

    def __init__(self, **kwargs):
        """
        Initialise Sample. Passing any unrecognised keywords will raise an error.

        Subclasses consume the keywords they understand and forward the rest here,
        so anything that arrives is by definition unrecognised.

        Raises:
            ValueError: If any keyword arguments are received.
        """
        if kwargs:
            keys = ", ".join(kwargs)
            raise ValueError(
                f"Unrecognised arguments in test sample initialisation: {keys}"
            )

    location = NotImplemented
    """Path of the test file."""

    @property
    def encodeable_name(self):
        """
        Identifying string which is safe to be included as a filename component or as a
        key in the skim stats JSON file.

        As a rough naming convention, data samples should start with 'Data-', MC sample
        with 'MC-', and custom samples with 'Custom-'.
        """
        return NotImplemented

    @property
    def printable_name(self):
        """
        Human-readable name for displaying in printed tables.
        """
        return NotImplemented

    @staticmethod
    def resolve_path(location):
        """
        Replace ``'${SampleDirectory}'`` with the standard sample directory
        (``/group/belle2/dataprod/MC/SkimTraining``), expand ``~``, and resolve the
        path.

        Parameters:
            location (str, pathlib.Path): Filename to be resolved.

        Returns:
            pathlib.Path: Resolved path.
        """
        SampleDirectory = "/group/belle2/dataprod/MC/SkimTraining"
        location = str(location).replace("${SampleDirectory}", SampleDirectory)
        return Path(location).expanduser().resolve()

    @property
    def as_dict(self):
        """
        Sample serialised as a dictionary.
        """
        return NotImplemented

    def __str__(self):
        """Return the ``encodeable_name`` of the sample."""
        return self.encodeable_name
84 
class DataSample(Sample):
    """
    Test sample identified by a data processing, an experiment, a beam energy and a
    general skim name. Its ``encodeable_name`` starts with ``Data-``.
    """

    def __init__(
        self,
        *,
        location,
        processing,
        experiment,
        beam_energy="4S",
        general_skim="all",
        **kwargs,
    ):
        """
        Parameters:
            location (str, pathlib.Path): Path of the test file. Passed through
                ``resolve_path``.
            processing (str): Processing label, stored unchanged.
            experiment (str, int): Experiment number. Stored as a string with an
                ``exp`` prefix, which is added if not already present.
            beam_energy (str): Beam energy label. Defaults to ``"4S"``.
            general_skim (str): General skim name. Defaults to ``"all"``.
            **kwargs: Anything else is forwarded to the base class, which raises
                ``ValueError`` for unrecognised keywords.
        """
        # Pass unrecognised kwargs to base class
        super().__init__(**kwargs)

        self.location = self.resolve_path(location)
        self.processing = processing
        # Normalise e.g. 12 or "12" to "exp12"
        if isinstance(experiment, int) or not experiment.startswith("exp"):
            experiment = f"exp{experiment}"
        self.experiment = experiment
        self.beam_energy = beam_energy
        self.general_skim = general_skim

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f"location={repr(self.location)}, "
            f"processing={repr(self.processing)}, "
            f"experiment={repr(self.experiment)}, "
            f"beam_energy={repr(self.beam_energy)}, "
            f"general_skim={repr(self.general_skim)})"
        )

    @property
    def as_dict(self):
        """Sample serialised as a dictionary keyed by the ``__init__`` argument names."""
        return {
            "location": str(self.location),
            "processing": self.processing,
            "experiment": self.experiment,
            "beam_energy": self.beam_energy,
            "general_skim": self.general_skim,
        }

    @property
    def encodeable_name(self):
        """``Data-<processing>-<experiment>-<beam_energy>-<general_skim>``."""
        return "-".join(
            (
                "Data",
                self.processing,
                self.experiment,
                self.beam_energy,
                self.general_skim,
            )
        )

    @property
    def printable_name(self):
        """Human-readable name: processing and experiment, plus any non-default fields."""
        name = f"{self.processing} {self.experiment}"
        # Only print additional info in non-default situations
        if self.beam_energy != "4S":
            name += f", {self.beam_energy}"
        if self.general_skim != "all":
            name += f", ({self.general_skim})"
        return name
149 
class MCSample(Sample):
    """
    Test sample identified by an MC process, a campaign, a beam energy and a beam
    background level. Its ``encodeable_name`` starts with ``MC-``.
    """

    def __init__(
        self,
        *,
        location,
        process,
        campaign,
        beam_energy="4S",
        beam_background="BGx1",
        **kwargs,
    ):
        """
        Parameters:
            location (str, pathlib.Path): Path of the test file. Passed through
                ``resolve_path``.
            process (str): Process label, stored unchanged.
            campaign (str, int): Campaign number. Stored as a string with an ``MC``
                prefix, which is added if not already present.
            beam_energy (str): Beam energy label. Defaults to ``"4S"``.
            beam_background (str, int): Beam background level. Stored as a string
                with a ``BGx`` prefix, which is added if not already present.
                Defaults to ``"BGx1"``.
            **kwargs: Anything else is forwarded to the base class, which raises
                ``ValueError`` for unrecognised keywords.
        """
        # Pass unrecognised kwargs to base class
        super().__init__(**kwargs)

        self.location = self.resolve_path(location)
        self.process = process
        self.beam_energy = beam_energy

        # Normalise e.g. 13 or "13" to "MC13"
        if isinstance(campaign, int) or not campaign.startswith("MC"):
            campaign = f"MC{campaign}"
        self.campaign = campaign

        # Normalise e.g. 1 or "1" to "BGx1"
        if isinstance(beam_background, int) or not beam_background.startswith("BGx"):
            beam_background = f"BGx{beam_background}"
        self.beam_background = beam_background

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f"location={repr(self.location)}, "
            f"process={repr(self.process)}, "
            f"campaign={repr(self.campaign)}, "
            f"beam_energy={repr(self.beam_energy)}, "
            f"beam_background={repr(self.beam_background)})"
        )

    @property
    def as_dict(self):
        """Sample serialised as a dictionary keyed by the ``__init__`` argument names."""
        return {
            "location": str(self.location),
            "process": self.process,
            "campaign": self.campaign,
            "beam_energy": self.beam_energy,
            "beam_background": self.beam_background,
        }

    @property
    def encodeable_name(self):
        """``MC-<campaign>-<beam_energy>-<process>-<beam_background>``."""
        return "-".join(
            (
                "MC",
                self.campaign,
                self.beam_energy,
                self.process,
                self.beam_background,
            )
        )

    @property
    def printable_name(self):
        """Human-readable name: campaign and process, plus any non-default fields."""
        name = f"{self.campaign} {self.process}"
        # Only print additional info in non-default situations
        if self.beam_background != "BGx1":
            name += f" {self.beam_background}"
        if self.beam_energy != "4S":
            name += f", {self.beam_energy}"
        return name
211 
212 
class CustomSample(Sample):
    """
    Test sample identified only by a file location and a free-form label. Its
    ``encodeable_name`` starts with ``Custom-``.
    """

    def __init__(self, *, location, label=None, **kwargs):
        """
        Parameters:
            location (str, pathlib.Path): Path of the test file. Passed through
                ``resolve_path``.
            label (str): Display name for the sample. Defaults to ``location`` as it
                was passed in (before path resolution), converted to a string.
            **kwargs: Anything else is forwarded to the base class, which raises
                ``ValueError`` for unrecognised keywords.
        """
        # Pass unrecognised kwargs to base class
        super().__init__(**kwargs)

        self.location = self.resolve_path(location)
        if label is None:
            self.label = str(location)
        else:
            self.label = label

        # Keep only ASCII letters and digits so the label is usable in
        # ``encodeable_name``
        self.sanitised_label = re.sub(r"[^A-Za-z0-9]", "", self.label)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f"location={repr(self.location)}, "
            f"label={repr(self.label)})"
        )

    @property
    def as_dict(self):
        """Sample serialised as a dictionary keyed by the ``__init__`` argument names."""
        return {"location": str(self.location), "label": self.label}

    @property
    def encodeable_name(self):
        """``Custom-<sanitised label>``."""
        return f"Custom-{self.sanitised_label}"

    @property
    def printable_name(self):
        """The label exactly as given (or as defaulted from the location)."""
        return self.label
244 
245 
class TestSampleList:
    """Container class for lists of MC, data, and custom samples."""

    DefaultSampleYAML = (
        "/group/belle2/dataprod/MC/SkimTraining/SampleLists/TestFiles.yaml"
    )

    def __init__(self, *, SampleYAML=None, SampleDict=None, SampleList=None):
        """
        Initialise a list of test samples. Three methods of initialisation are
        allowed. If no arguments are given this function will default to a standard list
        of samples defined in
        ``/group/belle2/dataprod/MC/SkimTraining/SampleLists/TestFiles.yaml``.

        Parameters:
            SampleYAML (str, pathlib.Path): Path to a YAML file containing sample
                specifications.
            SampleDict (dict): Dict containing sample specifications.
            SampleList (list(Sample)): List of Sample objects.

        Raises:
            ValueError: If more than one of the three arguments is passed.
        """
        if sum(p is not None for p in (SampleYAML, SampleDict, SampleList)) > 1:
            raise ValueError(
                "Only one out of SampleYAML, SampleDict, or SampleList can be passed."
            )

        if SampleList is not None:
            # Initialise from list of Sample objects; no schema validation is done
            self.mc_samples = [s for s in SampleList if isinstance(s, MCSample)]
            self.data_samples = [s for s in SampleList if isinstance(s, DataSample)]
            self.custom_samples = [
                s for s in SampleList if isinstance(s, CustomSample)
            ]
            return

        if SampleDict is None:
            if SampleYAML is None:
                SampleYAML = self.DefaultSampleYAML

            with open(SampleYAML) as f:
                SampleDict = yaml.safe_load(f)

        # SampleYAML is None here when a dict was passed directly
        self.validate_schema(SampleDict, SampleYAML)

        self._parse_all_samples(SampleDict)

    @property
    def _all_samples(self):
        """All samples in the order MC, data, custom."""
        return [*self.mc_samples, *self.data_samples, *self.custom_samples]

    def __iter__(self):
        yield from self._all_samples

    def __getitem__(self, i):
        return self._all_samples[i]

    def __len__(self):
        return len(self._all_samples)

    def __repr__(self):
        return f"{self.__class__.__name__}(" f"SampleList={repr(list(self))})"

    @property
    def SampleDict(self):
        """Sample list serialised as a dict with ``MC``, ``Data`` and ``Custom`` blocks."""
        return {
            "MC": [s.as_dict for s in self.mc_samples],
            "Data": [s.as_dict for s in self.data_samples],
            "Custom": [s.as_dict for s in self.custom_samples],
        }

    def validate_schema(self, SampleDict, InputYAML=None):
        """
        Validate YAML input against JSON schema defined in
        ``skim/tools/resources/test_samples_schema.json``.

        Parameters:
            SampleDict (dict): Sample specifications to validate.
            InputYAML (str, pathlib.Path): File the specifications were read from,
                if any. Only used to build the error message.

        Raises:
            ValueError: If validation fails and ``InputYAML`` is given.
            jsonschema.exceptions.ValidationError: If validation fails and
                ``InputYAML`` is not given.
        """
        schema_file = find_file("skim/tools/resources/test_samples_schema.json")
        with open(schema_file) as f:
            schema = json.load(f)

        try:
            jsonschema.validate(SampleDict, schema)
        except jsonschema.exceptions.ValidationError as e:
            if InputYAML:
                raise ValueError(
                    f"Error in sample list configuration file {InputYAML}"
                ) from e
            raise

    @staticmethod
    def _parse_samples(SampleDict, BlockName, SampleClass):
        """
        Build one ``SampleClass`` object per entry of ``SampleDict[BlockName]``.

        Returns an empty list if ``SampleDict`` is None, or if the block is missing
        or None. Each entry is passed to ``SampleClass`` as keyword arguments.
        """
        if SampleDict is None:
            return []

        try:
            InputSampleList = SampleDict[BlockName]
        except KeyError:
            return []

        if InputSampleList is None:
            return []

        return [SampleClass(**sample) for sample in InputSampleList]

    def _parse_all_samples(self, SampleDict):
        """
        Read in each block of the YAML and create lists of sample objects.

        Raises:
            ValueError: If constructing a sample in any block raises ``TypeError``.
        """
        MissingParams = (
            "Error in '{block}' block of test sample yaml file.\n"
            "The following must all have defined values: {params}"
        )

        try:
            self.data_samples = self._parse_samples(SampleDict, "Data", DataSample)
        except TypeError as e:
            required = ", ".join(
                f"'{p}'"
                for p in ("location", "processing", "beam_energy", "experiment")
            )
            raise ValueError(MissingParams.format(block="Data", params=required)) from e

        try:
            self.mc_samples = self._parse_samples(SampleDict, "MC", MCSample)
        except TypeError as e:
            required = ", ".join(f"'{p}'" for p in ("location", "process", "campaign"))
            raise ValueError(MissingParams.format(block="MC", params=required)) from e

        try:
            self.custom_samples = self._parse_samples(
                SampleDict, "Custom", CustomSample
            )
        except TypeError as e:
            required = ", ".join(f"'{p}'" for p in ("location",))
            raise ValueError(
                MissingParams.format(block="Custom", params=required)
            ) from e

    @staticmethod
    def _with_prefix(value, prefix):
        """
        Return a query value in the prefixed string form that samples store.

        ``None`` (meaning "do not filter") is passed through. Integers, and strings
        not already starting with ``prefix``, get ``prefix`` prepended.
        """
        if value is None:
            return None
        if isinstance(value, int) or not value.startswith(prefix):
            return f"{prefix}{value}"
        return value

    def query_mc_samples(
        self,
        *,
        process=None,
        campaign=None,
        beam_energy=None,
        beam_background=None,
        exact_match=False,
        inplace=False,
    ):
        """
        Find all MC samples matching query.

        Parameters:
            process (str): Simulated MC process to query.
            campaign (str, int): MC campaign number to query. ``13``, ``"13"`` and
                ``"MC13"`` are equivalent.
            beam_energy (str): Beam energy to query.
            beam_background (str, int): Nominal beam background to query. ``1``,
                ``"1"`` and ``"BGx1"`` are equivalent.
            exact_match (bool): If passed, an error is raised if there is not exactly
                one matching sample. If there is exactly one matching sample, then the
                single sample is returned, rather than a list.
            inplace (bool): Replace MC samples with the list obtained from query.

        Returns:
            The list of matching samples; the single matching sample if
            ``exact_match`` is set; ``None`` if ``inplace`` is set.

        Raises:
            ValueError: If both ``inplace`` and ``exact_match`` are set, or if
                ``exact_match`` is set and the number of matches is not one.
        """
        if inplace and exact_match:
            raise ValueError(
                "Incompatible arguments passed: `inplace` and `exact_match`"
            )

        # Bring the query into the stored form, so that the documented integer
        # queries can actually match
        campaign = self._with_prefix(campaign, "MC")
        beam_background = self._with_prefix(beam_background, "BGx")

        samples = [
            s
            for s in self.mc_samples
            if (process is None or s.process == process)
            and (campaign is None or s.campaign == campaign)
            and (beam_energy is None or s.beam_energy == beam_energy)
            and (beam_background is None or s.beam_background == beam_background)
        ]
        if exact_match:
            if len(samples) != 1:
                raise ValueError(
                    "`exact_match=True` was specified, but did not find exactly one match."
                )
            return samples[0]

        if inplace:
            self.mc_samples = samples
            return None
        return samples

    def query_data_samples(
        self,
        *,
        processing=None,
        experiment=None,
        beam_energy=None,
        general_skim=None,
        exact_match=False,
        inplace=False,
    ):
        """
        Find all data samples matching query.

        Parameters:
            processing (str): Data processing campaign number to query.
            experiment (str, int): Experiment number to query. ``12``, ``"12"`` and
                ``"exp12"`` are equivalent.
            beam_energy (str): Beam energy to query.
            general_skim (str): ``GeneralSkimName`` to query.
            exact_match (bool): If passed, an error is raised if there is not exactly
                one matching sample. If there is exactly one matching sample, then the
                single sample is returned, rather than a list.
            inplace (bool): Replace data samples with the list obtained from query.

        Returns:
            The list of matching samples; the single matching sample if
            ``exact_match`` is set; ``None`` if ``inplace`` is set.

        Raises:
            ValueError: If both ``inplace`` and ``exact_match`` are set, or if
                ``exact_match`` is set and the number of matches is not one.
        """
        if inplace and exact_match:
            raise ValueError(
                "Incompatible arguments passed: `inplace` and `exact_match`"
            )

        # Bring the query into the stored form, so that the documented integer
        # queries can actually match
        experiment = self._with_prefix(experiment, "exp")

        samples = [
            s
            for s in self.data_samples
            if (processing is None or s.processing == processing)
            and (experiment is None or s.experiment == experiment)
            and (beam_energy is None or s.beam_energy == beam_energy)
            and (general_skim is None or s.general_skim == general_skim)
        ]
        if exact_match:
            if len(samples) != 1:
                raise ValueError(
                    "`exact_match=True` was specified, but did not find exactly one match."
                )
            return samples[0]

        if inplace:
            self.data_samples = samples
            return None
        return samples
478 
479 
def get_test_file(process, *, SampleYAML=None):
    """
    Attempt to find a test sample of the given MC process.

    Parameters:
        process (str): Physics process, e.g. mixed, charged, ccbar, eemumu.
        SampleYAML (str, pathlib.Path): Path to a YAML file containing sample
            specifications. If not given, the default list of ``TestSampleList``
            is used.

    Returns:
        pathlib.Path: Location of the first matching test sample.

    Raises:
        ValueError: Raised if no sample can be found.
    """
    samples = TestSampleList(SampleYAML=SampleYAML)
    matches = samples.query_mc_samples(process=process)
    if not matches:
        raise ValueError(f"No test samples found for MC process '{process}'.")
    # Return the first match found
    return matches[0].location
502 
503 
if __name__ == "__main__":
    # Print the parsed contents of the YAML file named by the first command-line
    # argument, or of the default sample list when no argument is given.
    # The argument count is checked explicitly rather than catching IndexError
    # around the constructor, so an IndexError raised while loading the samples is
    # not mistaken for a missing argument.
    yaml_path = sys.argv[1] if len(sys.argv) > 1 else None
    samples = TestSampleList(SampleYAML=yaml_path)

    print("Samples defined in YAML file:")
    for sample in samples:
        print(f" * {repr(sample)}")
def printable_name(self)
Definition: testfiles.py:242
def encodeable_name(self)
Definition: testfiles.py:238
def printable_name(self)
Definition: testfiles.py:140
def encodeable_name(self)
Definition: testfiles.py:128
def as_dict(self)
Definition: testfiles.py:187
def printable_name(self)
Definition: testfiles.py:203
def encodeable_name(self)
Definition: testfiles.py:197
def resolve_path(location)
Definition: testfiles.py:59
def as_dict(self)
Definition: testfiles.py:75
def printable_name(self)
Definition: testfiles.py:52
def __init__(self, **kwargs)
Definition: testfiles.py:27
def encodeable_name(self)
Definition: testfiles.py:41
def validate_schema(self, SampleDict, InputYAML=None)
Definition: testfiles.py:313
def query_data_samples(self, *, processing=None, experiment=None, beam_energy=None, general_skim=None, exact_match=False, inplace=False)
Definition: testfiles.py:439
def query_mc_samples(self, *, process=None, campaign=None, beam_energy=None, beam_background=None, exact_match=False, inplace=False)
Definition: testfiles.py:390
def _parse_samples(SampleDict, BlockName, SampleClass)
Definition: testfiles.py:332
def _parse_all_samples(self, SampleDict)
Definition: testfiles.py:349
def __init__(self, *, SampleYAML=None, SampleDict=None, SampleList=None)
Definition: testfiles.py:253