Belle II Software  release-05-01-25
FormulaParser.cc
1 /**************************************************************************
2  * BASF2 (Belle Analysis Framework 2) *
3  * Copyright(C) 2018 - Belle II Collaboration *
4  * *
5  * Author: The Belle II Collaboration *
6  * Contributors: Martin Ritter *
7  * *
8  * This software is provided "as is" without any warranty. *
9  **************************************************************************/
10 
11 #include <framework/utilities/FormulaParser.h>
12 #include <stdexcept>
13 #include <cmath>
14 
15 namespace Belle2 {
21  char FormulaParserBase::operatorToChar(EOperator op) noexcept
22  {
23  switch (op) {
24  case EOperator::c_noop: return ' ';
25  case EOperator::c_plus: return '+';
26  case EOperator::c_minus: return '-';
27  case EOperator::c_multiply: return '*';
28  case EOperator::c_divide: return '/';
29  case EOperator::c_power: return '^';
30  case EOperator::c_roundBracketOpen: return '(';
31  case EOperator::c_roundBracketClose: return ')';
32  case EOperator::c_squareBracketOpen: return '[';
33  case EOperator::c_squareBracketClose: return ']';
34  }
35  return 0;
36  }
37 
38  double FormulaParserBase::applyOperator(EOperator op, double a, double b)
39  {
40  switch (op) {
41  case EOperator::c_plus: return a + b;
42  case EOperator::c_minus: return a - b;
43  case EOperator::c_multiply: return a * b;
44  case EOperator::c_divide: return a / b;
45  case EOperator::c_power: return std::pow(a, b);
46  default: throw std::runtime_error(std::string("Cannot apply operator ") + operatorToChar(op));
47  }
48  return 0;
49  }
50 
51  auto FormulaParserBase::checkNumber(ENumberStatus current, char next) -> ENumberStatus {
52  switch (current)
53  {
54  case ENumberStatus::c_Invalid:
55  // invalid stays invalid
56  return ENumberStatus::c_Invalid;
57  case ENumberStatus::c_Empty:
58  // numbers are allowed to start with digits, a dot or a sign
59  if (std::isdigit(next)) return ENumberStatus::c_Int;
60  if (next == '.') return ENumberStatus::c_LeadingDot;
61  if (next == '+' or next == '-') return ENumberStatus::c_Sign;
62  // everything else we don't like
63  return ENumberStatus::c_Invalid;
64  case ENumberStatus::c_Sign:
65  // if we started with a sign we can only go to digit and dots, no exponent
66  if (std::isdigit(next)) return ENumberStatus::c_Int;
67  if (next == '.') return ENumberStatus::c_Dot;
68  return ENumberStatus::c_Invalid;
69  case ENumberStatus::c_Int:
70  // So far it's a valid int consisting only of [sign +] digits, next
71  // stage is more digits, a . or an exponent
72  if (std::isdigit(next)) return ENumberStatus::c_Int;
73  if (next == '.') return ENumberStatus::c_Dot;
74  if (next == 'E' or next == 'e') return ENumberStatus::c_Exponent;
75  return ENumberStatus::c_Invalid;
76  case ENumberStatus::c_Dot:
77  // After the dot there can be more digits ... or a exponent
78  if (std::isdigit(next)) return ENumberStatus::c_Float;
79  if (next == 'E' or next == 'e') return ENumberStatus::c_Exponent;
80  return ENumberStatus::c_Invalid;
81  case ENumberStatus::c_LeadingDot:
82  // But if the dot was in the beginning then no exponent
83  if (std::isdigit(next)) return ENumberStatus::c_Float;
84  return ENumberStatus::c_Invalid;
85  // so, we saw some digits after the dot ... more digits or exponent it is
86  case ENumberStatus::c_Float:
87  if (std::isdigit(next)) return ENumberStatus::c_Float;
88  if (next == 'E' or next == 'e') return ENumberStatus::c_Exponent;
89  return ENumberStatus::c_Invalid;
90  case ENumberStatus::c_Exponent:
91  // and for the exponent we need either additional digits or a sign
92  if (std::isdigit(next)) return ENumberStatus::c_Scientific;
93  if (next == '+' or next == '-') return ENumberStatus::c_ExponentSign;
94  return ENumberStatus::c_Invalid;
95  case ENumberStatus::c_ExponentSign:
96  case ENumberStatus::c_Scientific:
97  // and after the exponent sign and any digit thereafter only digits are possible
98  if (std::isdigit(next)) return ENumberStatus::c_Scientific;
99  return ENumberStatus::c_Invalid;
100  }
101  return ENumberStatus::c_Invalid;
102  }
103 
104  void FormulaParserBase::assertOperatorUsable(size_t stacksize)
105  {
106  // we only have binary operators so we need two operands
107  if (stacksize < 1)
108  throw std::runtime_error("could not parse, stack of operands empty. Please report, this is most likely a bug");
109  if (stacksize < 2)
110  throw std::runtime_error("Missing operand");
111  }
112 
114  {
116  //the last thing we added was a variable so a bracket doesn't make sense
117  if (!m_lastTokenWasOperator) throw std::runtime_error("missing operator");
118  // otherwise, ont the stack it goes
119  m_operatorStack.push(op);
120  return;
121  }
123  // closing bracket. Look for a matching opening bracket and execute all
124  // operators until then
127  if (op == EOperator::c_squareBracketClose) std::swap(correct, wrong);
128  while (!m_operatorStack.empty()) {
129  EOperator tok = m_operatorStack.top();
130  m_operatorStack.pop();
131  if (tok == wrong) throw std::runtime_error("wrong type of closing bracket");
132  if (tok == correct) return;
133  executeOperator(tok);
134  }
135  // stack is empty, still no bracket
136  throw std::runtime_error("unmatched bracket");
137  }
138 
139  // Ok, now normal operators: there shouldn't be two in a row
140  if (m_lastTokenWasOperator) throw std::runtime_error("missing operand before operator");
141  m_lastTokenWasOperator = true;
142 
143  // The operator precedence is in the upper 4 bits ... hrhr
144  // TODO: make a function for this?
145  int op_precedence = (int)op >> 4;
146  while (!m_operatorStack.empty()) {
147  EOperator tok = m_operatorStack.top();
148  // Stop at brackets
150  int tok_precedence = (int)tok >> 4;
151  // Pow operator has right assiocativity, all others are left associative
152  // TODO: make nicer?
153  bool tok_right = op == EOperator::c_power;
154  // If the token has lower precedence or equal precedence but is right associative stop taking tokens
155  if (tok_precedence < op_precedence or (tok_precedence == op_precedence and tok_right)) break;
156  // otherwise pop and execute
157  executeOperator(tok);
158  m_operatorStack.pop();
159  }
160  m_operatorStack.push(op);
161  }
162 
164  {
165  while (!m_operatorStack.empty()) {
166  EOperator op = m_operatorStack.top();
167  m_operatorStack.pop();
168  // found a bracket but no more closing brackets to come ... so error
170  throw std::runtime_error("missing closing bracket");
172  }
173  }
174 
176  {
177  if (!m_currentVariableName.empty()) {
178  if (!m_lastTokenWasOperator) throw std::runtime_error("Missing operator before variable");
179  m_lastTokenWasOperator = false;
180  // looks like a number, so add a number
182  char* ptr;
183  double value;
184  value = std::strtod(m_currentVariableName.c_str(), &ptr);
185  addVariable(InputToken(value));
186  } else {
188  }
189  }
190  m_currentVariableName.clear();
192  }
193 
194  auto FormulaParserBase::checkForOperator(char next) -> EOperator {
195  if (next == '+' or next == '-')
196  {
197  // plus and minus are also part of literals so only treat it as operator
198  // if, together with the next character, this is not a valid float literal
199  auto isvalid = checkNumber(m_currentVariableNameNumberStatus, next);
200  if (isvalid != ENumberStatus::c_Invalid and checkNumber(isvalid, m_buffer.peek()) != ENumberStatus::c_Invalid) {
201  // this looks like a number don't interpret as operator
202  return EOperator::c_noop;
203  }
204  if (next == '+') return EOperator::c_plus;
205  if (next == '-') return EOperator::c_minus;
206  }
207  if (next == '/') return EOperator::c_divide;
208  if (next == '^') return EOperator::c_power;
209  if (next == '*')
210  {
211  // is it python style '**'? if yes, remove one char from stream and
212  // assume pow
213  if (m_buffer.peek() == '*') {
214  m_buffer.get();
215  return EOperator::c_power;
216  }
217  // otherwise multiply
218  return EOperator::c_multiply;
219  }
220  if (next == '(') return EOperator::c_roundBracketOpen;
221  if (next == ')') return EOperator::c_roundBracketClose;
222  if (next == '[') return EOperator::c_squareBracketOpen;
223  if (next == ']') return EOperator::c_squareBracketClose;
224  // no operator, so let's return just that
225  return EOperator::c_noop;
226  }
227 
228  void FormulaParserBase::processString(const std::string& formula)
229  {
230  // initialize buffer
231  m_buffer = std::istringstream(formula);
232  // clear stacks
233  std::stack<EOperator>().swap(m_operatorStack);
234  // and an empty identifier name
235  m_currentVariableName.clear();
237  // reset some other variable state
239  // and if the variable has arguments remember the nesting level of the ()
240  int nestlevel{0};
241  // Now loop over the whole formula character by character
242  for (char next; m_buffer.get(next);) {
243  // If nestlevel>0 we are in a variable(...) parameters area so ignore
244  // everything but keep track of how many open/closing brackets we saw
245  // until we are back to nestlevel=0
246  if (nestlevel > 0) {
247  m_currentVariableName += next;
248  if (next == '(') ++nestlevel;
249  if (next == ')') --nestlevel;
250  // finished variable arguments so variable is definitely done
251  if (nestlevel == 0) flushCurrentVariable();
252  // done with this character
253  continue;
254  }
255 
256  // check for opening parenthesis: could be variable arguments or operation binding
257  if (next == '(' and not m_currentVariableName.empty()) {
258  m_currentVariableName += next;
259  ++nestlevel;
260  // definitely not a number anymore
262  // done with this character
263  continue;
264  }
265 
266  // check for operator
267  auto opcode = checkForOperator(next);
268  if (opcode != EOperator::c_noop) {
269  // found operator, flush variable, add operator
271  addOperator(opcode);
272  // done with this character
273  continue;
274  }
275 
276  // check for whitespace
277  if (next == ' ' or next == '\n' or next == '\t' or next == '\r') {
278  // variable is finished, just flush here.
280  // otherwise nothing to do with whitespace ...
281  continue;
282  }
283 
284  // anything else is a identifier, most likely a variable name or a
285  // float literal now lets build up the variable name, first lets check
286  // if the variable name will still be a valid number
288  // then just add it to the state
289  m_currentVariableName += next;
290  }
291  if (nestlevel > 0) throw std::runtime_error("unterminated variable arguments");
292  // done with parsing, lets make sure everything is flushed
295  }
296 
297  void FormulaParserBase::raiseError(const std::runtime_error& e)
298  {
299  // So lets some fun printing the error message :D
300  std::ostringstream message;
301  // check where we stopped parsing
302  auto pos = m_buffer.tellg();
303  // -1 -> after the end
304  if (pos == -1) pos = m_buffer.str().size() + 1;
305  // basic boring message + reason
306  message << "Error parsing formula at character " << pos << ": " << e.what() << std::endl;
307  // now lets go through the formula again, line by line. YES, multi line formula are a thing
308  std::istringstream errbuff(m_buffer.str());
309  long lastpos = 0;
310  bool arrowShown{false};
311  for (std::string line; std::getline(errbuff, line);) {
312  // print each line
313  message << " " << line << std::endl;
314  // and get the position after each line
315  auto curpos = errbuff.tellg();
316  // if it's the last line or if we are now beyond the error position then print an arrow
317  if (!arrowShown && (curpos == -1 || curpos >= pos)) { // -1 = last line
318  // from the beginning to the line to the position of the error
319  for (long i = lastpos - 1; i < pos; ++i) message << "-";
320  message << "^" << std::endl;
321  // but only show it once
322  arrowShown = true;
323  }
324  lastpos = curpos;
325  }
326  throw std::runtime_error(message.str());
327  }
329 }
Belle2::FormulaParserBase::InputToken
std::variant< std::string, double > InputToken
Input token type: an input tokein is either a string or a float variable.
Definition: FormulaParser.h:69
Belle2::FormulaParserBase::m_currentVariableName
std::string m_currentVariableName
collect characters into a variable name
Definition: FormulaParser.h:115
Belle2::FormulaParserBase::EOperator::c_plus
@ c_plus
Addition.
Belle2::FormulaParserBase::EOperator::c_divide
@ c_divide
Division.
Belle2::FormulaParserBase::checkForOperator
EOperator checkForOperator(char next)
Check if the next character is a operator.
Definition: FormulaParser.cc:202
Belle2::FormulaParserBase::addVariable
virtual void addVariable(const InputToken &token)=0
Add a variable token to the current state.
Belle2::FormulaParserBase::m_operatorStack
std::stack< EOperator > m_operatorStack
Stack of operators for the Shunting-yard algorithm.
Definition: FormulaParser.h:119
Belle2::FormulaParserBase::ENumberStatus::c_Invalid
@ c_Invalid
Not a valid number.
Belle2::FormulaParserBase::EOperator::c_squareBracketOpen
@ c_squareBracketOpen
Open square bracket.
Belle2::FormulaParserBase::EOperator::c_power
@ c_power
Exponentation.
Belle2::FormulaParserBase::ENumberStatus::c_Empty
@ c_Empty
Empty string.
Belle2::FormulaParserBase::flushCurrentVariable
void flushCurrentVariable()
Flush the currently parsed variable name and add it to the state either as variable or number.
Definition: FormulaParser.cc:183
Belle2::FormulaParserBase::addOperator
void addOperator(EOperator op)
Add an operator to the internal state, convert them to reverse polish notation using the shunting yar...
Definition: FormulaParser.cc:121
Belle2::FormulaParserBase::EOperator::c_multiply
@ c_multiply
Multiply.
Belle2::FormulaParserBase::checkNumber
static ENumberStatus checkNumber(ENumberStatus current, char next)
Check if a string literal with a given number status continues to be a valid number if next is append...
Definition: FormulaParser.cc:59
Belle2::FormulaParserBase::EOperator::c_roundBracketClose
@ c_roundBracketClose
Close round bracket.
Belle2::FormulaParserBase::EOperator::c_noop
@ c_noop
No operation.
Belle2::FormulaParserBase::m_currentVariableNameNumberStatus
ENumberStatus m_currentVariableNameNumberStatus
State of the current variable name being a valid float literal.
Definition: FormulaParser.h:117
Belle2::FormulaParserBase::applyOperator
static double applyOperator(EOperator op, double a, double b)
Apply operator on two values.
Definition: FormulaParser.cc:46
Belle2::FormulaParserBase::m_lastTokenWasOperator
bool m_lastTokenWasOperator
Bool to check whether there were consecutive operators or variables.
Definition: FormulaParser.h:111
Belle2::FormulaParserBase::flushPendingOperators
void flushPendingOperators()
Flush all pending operators at the end of processing.
Definition: FormulaParser.cc:171
Belle2
Abstract base class for different kinds of events.
Definition: MillepedeAlgorithm.h:19
Belle2::FormulaParserBase::m_buffer
std::istringstream m_buffer
Buffer for the formula.
Definition: FormulaParser.h:113
Belle2::FormulaParserBase::processString
void processString(const std::string &formula)
Process the given formula and store the final state.
Definition: FormulaParser.cc:236
Belle2::FormulaParserBase::raiseError
void raiseError(const std::runtime_error &e)
Format the given runtime_error with context information and rethrow a new one.
Definition: FormulaParser.cc:305
Belle2::FormulaParserBase::EOperator
EOperator
List of known operators.
Definition: FormulaParser.h:41
Belle2::FormulaParserBase::EOperator::c_roundBracketOpen
@ c_roundBracketOpen
Open round bracket.
Belle2::FormulaParserBase::assertOperatorUsable
static void assertOperatorUsable(size_t stacksize)
Make sure we have enough operands to use an operator.
Definition: FormulaParser.cc:112
Belle2::FormulaParserBase::EOperator::c_minus
@ c_minus
Subtraction.
Belle2::FormulaParserBase::operatorToChar
static char operatorToChar(EOperator op) noexcept
Convert operator code to character.
Definition: FormulaParser.cc:29
Belle2::FormulaParserBase::executeOperator
virtual void executeOperator(EOperator op)=0
Execute an operator on the current state.
Belle2::FormulaParserBase::EOperator::c_squareBracketClose
@ c_squareBracketClose
Close square bracket.