Belle II Software development
Downloader.cc
1/**************************************************************************
2 * basf2 (Belle II Analysis Software Framework) *
3 * Author: The Belle II Collaboration *
4 * *
5 * See git log for contributors and copyright holders. *
6 * This file is licensed under LGPL-3.0, see LICENSE.md. *
7 **************************************************************************/
8
9#include <framework/database/Downloader.h>
10#include <framework/core/RandomNumbers.h>
11#include <framework/gearbox/Unit.h>
12#include <framework/logging/Logger.h>
13#include <framework/utilities/Utils.h>
14#include <framework/utilities/EnvironmentVariables.h>
15
16#include <curl/curl.h>
17#include <TMD5.h>
18
19#include <boost/algorithm/string.hpp>
20
21#include <chrono>
22#include <functional>
23#include <limits>
24#include <thread>
25
26namespace Belle2::Conditions {
28 struct CurlSession {
30 CURL* curl{nullptr};
32 curl_slist* headers{nullptr};
34 char errbuf[CURL_ERROR_SIZE];
36 double lasttime{0};
37 };
38
39 namespace {
50 size_t write_function(void* buffer, size_t size, size_t nmemb, void* userp)
51 {
52 // size in bytes is size*nmemb so copy the correct amount and return it to curl
53 try {
54 std::ostream& stream = *static_cast<std::ostream*>(userp);
55 stream.write(static_cast<const char*>(buffer), size * nmemb);
56 } catch (std::ios_base::failure& e) {
57 B2ERROR("Writing error while downloading: " << e.code().message() << '(' << e.code().value() << ')');
58 return 0;
59 }
60 return size * nmemb;
61 }
62
73 int progress_callback(void* clientp, curl_off_t dltotal, curl_off_t dlnow,
74 __attribute((unused)) curl_off_t ultotal, __attribute((unused)) curl_off_t ulnow)
75 {
76 // nothing to show ...
77 if (dlnow == 0) return 0;
78 // otherwise print number of transferred bytes
79 CurlSession& status = *static_cast<CurlSession*>(clientp);
80 double time = Utils::getClock();
81 // make sure we don't print the status too often
82 if (status.lasttime != 0 && (time - status.lasttime) / Unit::ms < 200) {
83 return 0;
84 }
85 status.lasttime = time;
86 if (dltotal > 0) {
87 B2DEBUG(39, "curl:= " << dlnow << " / " << dltotal << " bytes transferred");
88 } else {
89 B2DEBUG(39, "curl:= " << dlnow << " bytes transferred");
90 }
91 return 0;
92 }
93
104 int debug_callback([[maybe_unused]] CURL* handle, curl_infotype type, char* data, size_t size,
105 [[maybe_unused]] void* userptr)
106 {
107 std::string prefix = "curl:";
108 // Choose loglevel: if type is CURLINFO_TEXT the messages are general
109 // informations about what curl is doing. The more detailed information
110 // about incoming/outgoing headers is a bit less important so give it a
111 // higher log level.
112 int level = 39;
113 if (type == CURLINFO_TEXT) { prefix += "*"; level = 38; }
114 else if (type == CURLINFO_HEADER_OUT) prefix += ">";
115 else if (type == CURLINFO_HEADER_IN) prefix += "<";
116 else return 0;
117 // Convert char* data to a string and strip whitespace ...
118 std::string message(data, size);
119 boost::trim(message);
120 // And log if there's something left
121 if (!message.empty()) B2DEBUG(level, prefix << " " << message);
122 return 0;
123 }
124
126 std::string getUserAgent()
127 {
128 return "BASF2/" + ::Belle2::EnvironmentVariables::get("BELLE2_RELEASE", "unknown");
129 }
130 }
131 /* We only want to initialize curl once */
132 bool Downloader::s_globalInit{false};
133
135 {
136 static Downloader instance;
137 return instance;
138 }
139
141
142 std::string Downloader::escapeString(const std::string& text)
143 {
144 //make sure we have an active curl session ...
145 auto session = ensureSession(); // cppcheck-suppress unreadVariable
146 char* escaped = curl_easy_escape(m_session->curl, text.c_str(), text.size());
147 if (!escaped) {
148 throw std::runtime_error("Could not escape string");
149 }
150 std::string escapedStr{escaped};
151 curl_free(escaped);
152 return escapedStr;
153 }
154
156 std::string Downloader::joinWithSlash(const std::string& base, const std::string& rest)
157 {
158 return boost::trim_right_copy_if(base, boost::is_any_of("/")) + "/" +
159 boost::trim_left_copy_if(rest, boost::is_any_of("/"));
160 }
161
163 {
164 // start a curl session but if there is already one return false
165 if (m_session) return false;
166 // make sure curl is initialized correctly
167 if (!s_globalInit) {
168 curl_global_init(CURL_GLOBAL_ALL);
169 s_globalInit = true;
170 }
171 // create the curl session
172 m_session = std::make_unique<CurlSession>();
173 m_session->curl = curl_easy_init();
174 if (!m_session->curl) {
175 B2FATAL("Cannot initialize libcurl");
176 }
177 m_session->headers = curl_slist_append(nullptr, "Accept: application/json");
178 curl_easy_setopt(m_session->curl, CURLOPT_HTTPHEADER, m_session->headers);
179 curl_easy_setopt(m_session->curl, CURLOPT_TCP_KEEPALIVE, 1L);
180 curl_easy_setopt(m_session->curl, CURLOPT_CONNECTTIMEOUT, m_connectionTimeout);
181 curl_easy_setopt(m_session->curl, CURLOPT_LOW_SPEED_LIMIT, 10 * 1024); //10 kB/s
182 curl_easy_setopt(m_session->curl, CURLOPT_LOW_SPEED_TIME, m_stalledTimeout);
183 curl_easy_setopt(m_session->curl, CURLOPT_WRITEFUNCTION, write_function);
184 curl_easy_setopt(m_session->curl, CURLOPT_VERBOSE, 1);
185 curl_easy_setopt(m_session->curl, CURLOPT_NOPROGRESS, 0);
186 curl_easy_setopt(m_session->curl, CURLOPT_DEBUGFUNCTION, debug_callback);
187 curl_easy_setopt(m_session->curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
188 curl_easy_setopt(m_session->curl, CURLOPT_XFERINFODATA, m_session.get());
189 curl_easy_setopt(m_session->curl, CURLOPT_FAILONERROR, true);
190 curl_easy_setopt(m_session->curl, CURLOPT_ERRORBUFFER, m_session->errbuf);
191 // enable transparent compression support
192 curl_easy_setopt(m_session->curl, CURLOPT_ACCEPT_ENCODING, "");
193 // Set proxy if defined
194 if (EnvironmentVariables::isSet("BELLE2_CONDB_PROXY")) {
195 const std::string proxy = EnvironmentVariables::get("BELLE2_CONDB_PROXY");
196 curl_easy_setopt(m_session->curl, CURLOPT_PROXY, proxy.c_str());
197 }
198 curl_easy_setopt(m_session->curl, CURLOPT_AUTOREFERER, 1L);
199 curl_easy_setopt(m_session->curl, CURLOPT_FOLLOWLOCATION, 1L);
200 curl_easy_setopt(m_session->curl, CURLOPT_MAXREDIRS, 10L);
201 curl_easy_setopt(m_session->curl, CURLOPT_TCP_FASTOPEN, 0L);
202 curl_easy_setopt(m_session->curl, CURLOPT_SSL_VERIFYPEER, 0L);
203 curl_easy_setopt(m_session->curl, CURLOPT_SSL_VERIFYHOST, 0L);
204 curl_easy_setopt(m_session->curl, CURLOPT_SSL_VERIFYSTATUS, 0L);
205 curl_easy_setopt(m_session->curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_WHATEVER);
206 // Don't cache DNS entries, ask the system every time we need to connect ...
207 curl_easy_setopt(m_session->curl, CURLOPT_DNS_CACHE_TIMEOUT, 0L);
208 // and shuffle the addresses so we try a different node, otherwise we might
209 // always get the same address due to system caching and RFC 3484
210 curl_easy_setopt(m_session->curl, CURLOPT_DNS_SHUFFLE_ADDRESSES, 1L);
211 auto version = getUserAgent();
212 curl_easy_setopt(m_session->curl, CURLOPT_USERAGENT, version.c_str());
213 return true;
214 }
215
217 {
218 // if there's a session clean it ...
219 if (m_session) {
220 curl_easy_cleanup(m_session->curl);
221 curl_slist_free_all(m_session->headers);
222 m_session.reset();
223 }
224 }
225
226 std::string Downloader::calculateChecksum(std::istream& input)
227 {
228 // rewind stream
229 input.clear();
230 input.seekg(0, std::ios::beg);
231 // and calculate md5 checksum by feeding it blockwise to the TMD5 update
232 TMD5 md5;
233 char buffer[4096];
234 while (input.good()) {
235 input.read(buffer, 4096);
236 if (input.gcount() == 0) break;
237 md5.Update((unsigned char*)buffer, input.gcount());
238 }
239 // finalize and return output
240 md5.Final();
241 return md5.AsString();
242 }
243
244 void Downloader::setConnectionTimeout(unsigned int timeout)
245 {
246 m_connectionTimeout = timeout;
247 if (m_session) {
248 curl_easy_setopt(m_session->curl, CURLOPT_CONNECTTIMEOUT, m_connectionTimeout);
249 }
250 }
251
252 void Downloader::setStalledTimeout(unsigned int timeout)
253 {
254 m_stalledTimeout = timeout;
255 if (m_session) {
256 curl_easy_setopt(m_session->curl, CURLOPT_LOW_SPEED_TIME, m_stalledTimeout);
257 }
258 }
259
260 bool Downloader::download(const std::string& url, std::ostream& buffer, bool silentOnMissing)
261 {
262 // make sure we have an active curl session ...
263 auto session = ensureSession();
264 // and initialize the internal random number generator
266 B2DEBUG(37, "Download started ..." << LogVar("url", url));
267 // we might need to try a few times in case of HTTP error >= 300
268 for (unsigned int retry{1};; ++retry) {
269 //rewind the stream to the beginning
270 buffer.clear();
271 buffer.seekp(0, std::ios::beg);
272 if (!buffer.good()) {
273 throw std::runtime_error("cannot write to stream");
274 }
275 // Set the exception flags to notify us of any problem during writing
276 auto oldExceptionMask = buffer.exceptions();
277 buffer.exceptions(std::ios::failbit | std::ios::badbit);
278 // build the request ...
279 CURLcode res{CURLE_FAILED_INIT};
280 // and set all the curl options
281 curl_easy_setopt(m_session->curl, CURLOPT_URL, url.c_str());
282 curl_easy_setopt(m_session->curl, CURLOPT_WRITEDATA, &buffer);
283 // perform the request ...
284 res = curl_easy_perform(m_session->curl);
285 // flush output
286 buffer.exceptions(oldExceptionMask);
287 buffer.flush();
288 // and check for errors which occurred during download ...
289 if (res != CURLE_OK) {
290 size_t len = strlen(m_session->errbuf);
291 const std::string error = len ? m_session->errbuf : curl_easy_strerror(res);
292 if (m_maxRetries > 0 && res == CURLE_HTTP_RETURNED_ERROR) {
293 if (retry <= m_maxRetries) {
294 // we treat everything below 300 as permanent error with the request,
295 // while if 300 or above we retry
296 // 404 corresponds to Not Found and we want to treat it differently
297 long responseCode{0};
298 curl_easy_getinfo(m_session->curl, CURLINFO_RESPONSE_CODE, &responseCode);
299 if (responseCode >= 300 and responseCode != 404) {
300 // use exponential backoff but don't restrict to exact slots like
301 // Ethernet, just use a random wait time between 1s and maxDelay =
302 // 2^(retry)-1 * backoffFactor
303 double maxDelay = (std::pow(2, retry) - 1) * m_backoffFactor;
304 // This is an exception in the whole basf2: instead of relying on gRandom for getting a random number,
305 // we rely on a different random number generator, and the reason is:
306 // since the request may fail because of several reasons independent from basf2 (bad connection,
307 // faulty squid cache, etc.), we might retry a new request altering the internal state of the gRandom
308 // instance, spoiling our capability to fully reproduce our results.
309 // In this way, relying on a different generator, we are safe.
310 m_rndDistribution->param(std::uniform_real_distribution<double>::param_type(1.0, maxDelay));
311 double seconds = (*m_rndDistribution)(*m_rnd);
312 B2WARNING("Could not download url, retrying ..."
313 << LogVar("url", url) << LogVar("error", error)
314 << LogVar("try", retry) << LogVar("waiting time", seconds));
315 std::this_thread::sleep_for(std::chrono::milliseconds((int)(seconds * 1e3)));
316 continue;
317 }
318 // special treatment for 404: if silentOnMissing is true we just return false silently
319 // this is useful when checking if a file exists on the server
320 if (responseCode == 404 and silentOnMissing) return false;
321 }
322 }
323 throw std::runtime_error(error);
324 }
325 break;
326 }
327 // all fine
328 B2DEBUG(37, "Download finished successfully." << LogVar("url", url));
329 return true;
330 }
331
333 {
334 if (not m_rndIsInitialized) {
335 // We need to provide a seed for m_rnd: let's take the basf2Seed and hash it
336 auto downloaderSeed = std::hash<std::string> {}(RandomNumbers::getSeed());
337 m_rnd->seed(downloaderSeed);
338 m_rndIsInitialized = true;
339 }
340 }
341} // namespace Belle2::Conditions
Simple class to encapsulate libcurl as used by the ConditionsDatabase.
Definition: Downloader.h:22
void finishSession()
Finish an existing curl session if any is active at the moment.
Definition: Downloader.cc:216
static bool s_globalInit
flag to indicate whether curl has been initialized already
Definition: Downloader.h:98
std::unique_ptr< std::uniform_real_distribution< double > > m_rndDistribution
A uniform real distribution for extracting random numbers.
Definition: Downloader.h:121
bool startSession()
Start a new curl session if none is active at the moment.
Definition: Downloader.cc:162
void initializeRandomGeneratorSeed()
Initialize the seed of the internal random number generator.
Definition: Downloader.cc:332
bool download(const std::string &url, std::ostream &stream, bool silentOnMissing=false)
Get a URL and save the content to stream. This function raises exceptions when there are any problems.
Definition: Downloader.cc:260
unsigned int m_maxRetries
Number of retries to perform when downloading fails with HTTP response code >=300.
Definition: Downloader.h:104
unsigned int m_connectionTimeout
Timeout to wait for connections in seconds.
Definition: Downloader.h:100
void setStalledTimeout(unsigned int timeout)
Set the timeout to wait for stalled connections (<10KB/s), 0 disables timeout.
Definition: Downloader.cc:252
std::unique_ptr< CurlSession > m_session
curl session handle
Definition: Downloader.h:96
unsigned int m_stalledTimeout
Timeout to wait for stalled connections (<10KB/s)
Definition: Downloader.h:102
bool m_rndIsInitialized
Flag for keeping track if the internal random generator is correctly initialized or not.
Definition: Downloader.h:123
std::unique_ptr< std::mt19937 > m_rnd
This is a special exception in basf2 where an instance of gRandom is NOT used: since this class inter...
Definition: Downloader.h:119
std::string joinWithSlash(const std::string &base, const std::string &second)
Join two strings and make sure that there is exactly one '/' between them.
Definition: Downloader.cc:156
static std::string calculateChecksum(std::istream &input)
Calculate the digest/checksum of the data read from a given input stream.
Definition: Downloader.cc:226
void setConnectionTimeout(unsigned int timeout)
Set the timeout to wait for connections in seconds, 0 means built in curl default.
Definition: Downloader.cc:244
std::string escapeString(const std::string &text)
Escape a string to make it safe to be used in web requests.
Definition: Downloader.cc:142
ScopeGuard ensureSession()
Make sure there's an active session and return a ScopeGuard object that closes the session on destruc...
Definition: Downloader.h:43
unsigned int m_backoffFactor
Backoff factor for retries in seconds.
Definition: Downloader.h:106
static Downloader & getDefaultInstance()
Return the default instance.
Definition: Downloader.cc:134
static std::string getSeed()
Get the random number generator seed.
Definition: RandomNumbers.h:92
static const double ms
[millisecond]
Definition: Unit.h:96
Class to store variables with their name which were sent to the logging service.
static std::string get(const std::string &name, const std::string &fallback="")
Get the value of an environment variable or the given fallback value if the variable is not set.
static bool isSet(const std::string &name)
Check if an environment variable is set.
double getClock()
Return current value of the real-time clock.
Definition: Utils.cc:66
struct encapsulating all the state information needed by curl
Definition: Downloader.cc:28
CURL * curl
curl session information
Definition: Downloader.cc:30
double lasttime
last time we printed the status (in ns)
Definition: Downloader.cc:36
curl_slist * headers
headers to send with every request
Definition: Downloader.cc:32
char errbuf[CURL_ERROR_SIZE]
error buffer in case some error happens during downloading
Definition: Downloader.cc:34