TRIQS/TRIQS 4.0.0
Researching Interacting Quantum Systems
Loading...
Searching...
No Matches
mc_generic.cpp
Go to the documentation of this file.
1// Copyright (c) 2013-2018 Commissariat à l'énergie atomique et aux énergies alternatives (CEA)
2// Copyright (c) 2013-2018 Centre national de la recherche scientifique (CNRS)
3// Copyright (c) 2018-2023 Simons Foundation
4// Copyright (c) 2017 Hugo U.R. Strand
5//
6// This program is free software: you can redistribute it and/or modify
7// it under the terms of the GNU General Public License as published by
8// the Free Software Foundation, either version 3 of the License, or
9// (at your option) any later version.
10//
11// This program is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15//
16// You may obtain a copy of the License at
17// https://www.gnu.org/licenses/gpl-3.0.txt
18//
19// Authors: Michel Ferrero, Henri Menke, Olivier Parcollet, Priyanka Seth, Hugo U. R. Strand, Nils Wentzell, Thomas Ayral
20
25
26#include "./concepts.hpp"
27#include "./mc_generic.hpp"
30
31#include <fmt/format.h>
32#include <fmt/ranges.h>
33#include <mpi/mpi.hpp>
34#include <mpi/monitor.hpp>
35#include <mpi/vector.hpp>
36#include <nda/macros.hpp>
37
38#include <algorithm>
39#include <complex>
40#include <cstdint>
41#include <functional>
42#include <iostream>
43#include <memory>
44#include <stdexcept>
45
46namespace triqs::mc_tools {
47
48 template <DoubleOrComplex MCSignType> int mc_generic<MCSignType>::run(run_param_t const &params) {
49 EXPECTS(params.cycle_length > 0);
50 EXPECTS(params.stop_callback);
51 EXPECTS(params.after_cycle_duty);
52
53 // set the initial sign if specified
54 if (params.initial_sign != default_initial_sign) sign_ = params.initial_sign;
55
56 // reset move statistics
57 moves_.clear_statistics();
58
59 // start timer
60 run_timer_ = {};
61 run_timer_.start();
62
63 // start signal handler
65
66 // start MPI monitors
67 std::unique_ptr<mpi::monitor> exception_monitor;
68 if (params.propagate_exception and mpi::has_env) exception_monitor = std::make_unique<mpi::monitor>(params.comm);
69 std::unique_ptr<mpi::monitor> cycle_monitor;
70 if (params.continue_after_ncycles_done and mpi::has_env) cycle_monitor = std::make_unique<mpi::monitor>(params.comm);
71
72 // prepare the simulation parameters and statistics
73 auto const rank = params.comm.rank();
74 bool stop_flag = params.ncycles == 0;
75 std::int64_t cycle_counter = 1;
76 double next_print_info = 0.1;
77 double next_check_except = params.check_exception_interval;
78 double next_check_cycles = params.check_cycles_interval;
79 percentage_done_ = stop_flag ? 100 : 0;
80 nmeasures_done_ = 0;
81
82 // run simulation
83 for (; !stop_flag; ++cycle_counter) {
84 try {
85 // do cycle_length MC steps / cycle
86 for (std::int64_t i = 0; i < params.cycle_length; ++i) {
88 // Metropolis step
89 metropolis_step();
90 ++config_id_;
91 }
92 // skip after-cycle duties in overtime cycles: they may communicate and have to run
93 // exactly ncycles times on every rank (see run_param_t::continue_after_ncycles_done)
94 bool const in_overtime = params.ncycles > 0 and cycle_counter > params.ncycles;
95 if (not in_overtime) after_cycle_duties(params);
96 if (params.enable_measures) do_measurements();
97 } catch (triqs::signal_handler::exception const &) {
98 // current cycle is interrupted, simulation is stopped below
99 std::cerr << fmt::format("[Rank {}] Signal caught in mc_generic::run: Stopping the simulation.\n", rank);
100 } catch (std::exception const &err) {
101 // log the exception and node number, either abort or report to other nodes
102 std::cerr << fmt::format("[Rank {}] Error int mc_generic::run: Exception occured:\n{}\n", rank, err.what());
103 if (exception_monitor) {
104 exception_monitor->report_local_event();
105 break;
106 } else {
107 params.comm.abort(2);
108 }
109 }
110
111 // recompute fraction done and runtime so far
112 percentage_done_ = static_cast<double>(cycle_counter) * 100.0 / params.ncycles;
113 double runtime = run_timer_;
114
115 // is it time to print simulation info
116 if (runtime > next_print_info) {
117 // increase time interval non-linearly until next print and print info
118 next_print_info = 1.25 * runtime + 2.0;
119 print_sim_info(params, cycle_counter);
120 }
121
122 // is it time to check for exceptions on other ranks
123 if (exception_monitor && runtime > next_check_except) {
124 // increase time until next check and check other ranks
125 next_check_except += params.check_exception_interval;
126 stop_flag |= exception_monitor->event_on_any_rank();
127 }
128
129 // have we done all requested cycles
130 if (percentage_done_ >= 100) {
131 if (cycle_monitor) {
132 // if continue_after_ncycles_done == true, report to other ranks
133 cycle_monitor->report_local_event();
134 // is it time to check if the other ranks are finished as well
135 if (runtime > next_check_cycles) {
136 // increase time until next check and check other ranks
137 next_check_cycles += params.check_cycles_interval;
138 stop_flag |= cycle_monitor->event_on_all_ranks();
139 }
140 } else {
141 // if continue_after_ncycles_done == false, stop the simulation
142 stop_flag = true;
143 }
144 }
145
146 // update stop flag
147 stop_flag |= (params.stop_callback() || triqs::signal_handler::received());
148 }
149
150 // stop timer
151 run_timer_.stop();
152
153 // update phase-specific timer
154 switch (params.phase) {
155 case mc_phase::warmup: warmup_timer_ = run_timer_; break;
156 case mc_phase::accumulation: acc_timer_ = run_timer_; break;
157 }
158
159 // update statistics
160 --cycle_counter;
161 ncycles_done_ += cycle_counter;
162
163 // print final simulation info
164 print_sim_info(params, cycle_counter);
165
166 // stop signal handler
167 int status = (percentage_done_ >= 100 ? 0 : (triqs::signal_handler::received() ? 2 : 1));
169
170 // stop MPI monitors
171 if (exception_monitor) {
172 exception_monitor->finalize_communications();
173 if (exception_monitor->event_on_any_rank())
174 throw std::runtime_error(fmt::format("[Rank {}] MC simulation stopped because an exception occurred on one of the MPI ranks", rank));
175 }
176 if (cycle_monitor) cycle_monitor->finalize_communications();
177
178 // final reports
179 if (status == 1) report_ << fmt::format("[Rank {}] MC simulation stopped because stop_callback() returned true\n", rank);
180 if (status == 2) report_ << fmt::format("[Rank {}] MC simulation stopped because a signal has been received\n", rank);
181
182 return status;
183 }
184
185 template <DoubleOrComplex MCSignType> int mc_generic<MCSignType>::warmup(run_param_t const &params) {
186 report_(3) << fmt::format("[Rank {}] Performing warmup phase...\n", params.comm.rank());
187 auto p = params;
188 p.enable_measures = false;
189 p.phase = mc_phase::warmup;
190 return run(p);
191 }
192
193 template <DoubleOrComplex MCSignType> int mc_generic<MCSignType>::accumulate(run_param_t const &params) {
194 report_(3) << fmt::format("[Rank {}] Performing accumulation phase...\n", params.comm.rank());
195 auto p = params;
196 p.enable_measures = true;
197 p.enable_calibration = false;
198 p.phase = mc_phase::accumulation;
199 return run(p);
200 }
201
202 template <DoubleOrComplex MCSignType>
203 int mc_generic<MCSignType>::run(std::int64_t ncycles, std::int64_t cycle_length, std::function<bool()> stop_callback, bool enable_measures,
204 mpi::communicator c, bool enable_calibration) {
205 return run({.ncycles = ncycles,
206 .cycle_length = cycle_length,
207 .stop_callback = stop_callback,
208 .comm = c,
209 .enable_measures = enable_measures,
210 .enable_calibration = enable_calibration,
211 .phase = enable_measures ? mc_phase::accumulation : mc_phase::warmup});
212 }
213
214 template <DoubleOrComplex MCSignType>
215 int mc_generic<MCSignType>::warmup(std::int64_t ncycles, std::int64_t cycle_length, std::function<bool()> stop_callback, MCSignType initial_sign,
216 mpi::communicator c) {
217 return warmup({.ncycles = ncycles, .cycle_length = cycle_length, .stop_callback = stop_callback, .initial_sign = initial_sign, .comm = c});
218 }
219
220 template <DoubleOrComplex MCSignType>
221 int mc_generic<MCSignType>::warmup(std::int64_t ncycles, std::int64_t cycle_length, std::function<bool()> stop_callback, mpi::communicator c) {
222 return warmup({.ncycles = ncycles, .cycle_length = cycle_length, .stop_callback = stop_callback, .comm = c});
223 }
224
225 template <DoubleOrComplex MCSignType>
226 int mc_generic<MCSignType>::accumulate(std::int64_t ncycles, std::int64_t cycle_length, std::function<bool()> stop_callback, mpi::communicator c) {
227 return accumulate({.ncycles = ncycles, .cycle_length = cycle_length, .stop_callback = stop_callback, .comm = c});
228 }
229
230 template <DoubleOrComplex MCSignType>
231 int mc_generic<MCSignType>::warmup_and_accumulate(std::int64_t ncycles_warmup, std::int64_t ncycles_acc, std::int64_t cycle_length,
232 std::function<bool()> stop_callback, MCSignType initial_sign, mpi::communicator c) {
233 auto status =
234 warmup({.ncycles = ncycles_warmup, .cycle_length = cycle_length, .stop_callback = stop_callback, .initial_sign = initial_sign, .comm = c});
235 if (status == 0) status = accumulate({.ncycles = ncycles_acc, .cycle_length = cycle_length, .stop_callback = stop_callback, .comm = c});
236 return status;
237 }
238
239 template <DoubleOrComplex MCSignType>
240 int mc_generic<MCSignType>::warmup_and_accumulate(std::int64_t ncycles_warmup, std::int64_t ncycles_acc, std::int64_t cycle_length,
241 std::function<bool()> stop_callback, mpi::communicator c) {
242 return warmup_and_accumulate(ncycles_warmup, ncycles_acc, cycle_length, stop_callback, default_initial_sign, c);
243 }
244
245 template <DoubleOrComplex MCSignType> void mc_generic<MCSignType>::collect_results(mpi::communicator const &c) {
246 report_(3) << fmt::format("[Rank {}] Collect results: Waiting for all MPI processes to finish accumulating...\n", c.rank());
247
248 // collect results from all MPI processes
249 measures_.collect_results(c);
250 moves_.collect_statistics(c);
251 auto tot_nmeasures = mpi::reduce(nmeasures_done_, c);
252 auto tot_duration = mpi::reduce(get_accumulation_time(), c);
253
254 // generate rank dependent output string (skipped on ranks where it would not be printed;
255 // gather below is still called on every rank with an empty contribution to keep the collective safe)
256 std::string info;
257 if (verbosity_lvl_ >= 3) {
258 info = "\n";
259 info += fmt::format("[Rank {}] Warmup duration: {:.4f} seconds [{}]\n", c.rank(), get_warmup_time(), get_warmup_time_HHMMSS());
260 info += fmt::format("[Rank {}] Simulation duration: {:.4f} seconds [{}]\n", c.rank(), get_accumulation_time(), get_accumulation_time_HHMMSS());
261 info += fmt::format("[Rank {}] Number of measures: {}\n", c.rank(), nmeasures_done_);
262 info += fmt::format("[Rank {}] Cycles (measures) / second: {:.2e}\n", c.rank(), nmeasures_done_ / get_accumulation_time());
263 info += fmt::format("[Rank {}] Measurement durations (total = {:.4f}):\n{}", c.rank(), measures_.total_duration(),
264 measures_.get_timings(fmt::format("[Rank {}] ", c.rank())));
265 info += fmt::format("[Rank {}] Move durations (total = {:.4f}):\n{}", c.rank(), moves_.total_duration(),
266 moves_.get_timings(fmt::format("[Rank {}] ", c.rank())));
267 info += fmt::format("[Rank {}] Move statistics:\n{}", c.rank(), moves_.get_statistics(fmt::format("[Rank {}] ", c.rank())));
268 }
269
270 // gather all output strings on rank 0 to print in order (collective; ranks with low verbosity contribute "")
271 auto all_infos = mpi::gather(info, c);
272 if (c.rank() == 0) {
273 report_(3) << all_infos;
274 std::string more_info{"\n"};
275 more_info += fmt::format("Total number of measures: {}\n", tot_nmeasures);
276 more_info += fmt::format("Total cycles (measures) / second: {:.2e}\n", tot_nmeasures / tot_duration);
277 report_(2) << more_info;
278 }
279 }
280
281 template <DoubleOrComplex MCSignType> void mc_generic<MCSignType>::print_sim_info(run_param_t const &params, std::int64_t cycle_counter) {
282 // current simulation parameters
283 auto const rank = params.comm.rank();
284 double const runtime = run_timer_;
285 auto const cycles_per_sec = static_cast<double>(cycle_counter) / runtime;
286
287 // do the printing
288 if (percentage_done_ < 0) {
289 report_(3) << fmt::format("[Rank {}] {} cycle {}, {:.2e} cycles/sec\n", rank, utility::timestamp(), cycle_counter, cycles_per_sec);
290 } else {
291 report_(3) << fmt::format("[Rank {}] {} {:>6.2f}% done, ETA {}, cycle {} of {}, {:.2e} cycles/sec\n", rank, utility::timestamp(),
292 percentage_done_, utility::estimate_time_left(params.ncycles, cycle_counter, run_timer_), cycle_counter,
293 params.ncycles, cycles_per_sec);
294 }
295 if (params.enable_measures) report_(3) << measures_.report();
296 }
297
298 template <DoubleOrComplex MCSignType> void mc_generic<MCSignType>::metropolis_step() {
299 double r = moves_.attempt();
300 if (rng_() < std::min(1.0, r)) {
301 sign_ *= moves_.accept();
302 } else {
303 moves_.reject();
304 }
305 }
306
307 template <DoubleOrComplex MCSignType> void mc_generic<MCSignType>::after_cycle_duties(run_param_t const &params) {
308 params.after_cycle_duty();
309 if (params.enable_calibration) moves_.calibrate(params.comm);
310 }
311
312 template <DoubleOrComplex MCSignType> void mc_generic<MCSignType>::do_measurements() {
313 ++nmeasures_done_;
314 for (auto &m : measures_aux_) m();
315 measures_.accumulate(sign_);
316 }
317
318 // Explicit template instantiations.
319 template class mc_generic<double>;
320 template class mc_generic<std::complex<double>>;
321
322} // namespace triqs::mc_tools
Generic Monte Carlo class.
auto get_warmup_time_HHMMSS() const
Get the time spent in the warmup phase in hours, minutes, and seconds.
int warmup_and_accumulate(std::int64_t ncycles_warmup, std::int64_t ncycles_acc, std::int64_t cycle_length, std::function< bool()> stop_callback, MCSignType initial_sign, mpi::communicator c=mpi::communicator{})
Run the warumup and accumulation phases of the MC simulation.
double get_accumulation_time() const
Get the time spent in the accumulation phase in seconds.
double get_warmup_time() const
Get the time spent in the warmup phase in seconds.
int run(run_param_t const &params)
Run a generic MC simulation.
int warmup(run_param_t const &params)
Run the warumup phase of the MC simulation.
int accumulate(run_param_t const &params)
Run the accumulations phase of the MC simulation.
void collect_results(mpi::communicator const &c)
Collect results from multiple MPI processes.
std::string get_accumulation_time_HHMMSS() const
Get the time spent in the accumulation phase in hours, minutes and seconds.
Empty exception type that callers may use to signal a graceful shutdown.
std::string estimate_time_left(int N, int n, timer &t)
Linear extrapolation of the remaining time of a loop, formatted as HH:MM:SS.
Definition timestamp.hpp:84
std::string timestamp()
Current local time formatted as HH:MM:SS.
Definition timestamp.hpp:47
Provides a generic class to run Monte Carlo simulations.
Provides concepts for the MC tools.
void start()
Install the TRIQS signal handler.
void stop()
Restore the previous signal disposition.
bool received(bool pop_)
Whether at least one signal has been queued since the last reset.
Provides a signal handler for the TRIQS library.
User-provided MC simulation parameters for the run() method.
std::int64_t ncycles
Number of MC cycles to run (< 1 to run indefinitely).
bool continue_after_ncycles_done
Should we continue the simulation on the current rank after the given number of cycles is done and wa...
std::function< void()> after_cycle_duty
Callback function that is executed after each of the first ncycles cycles.
MCSignType initial_sign
The sign of the MC weight is initialized to this value before the simulation starts (if it is !...
double check_exception_interval
Time interval (in seconds) after which the simulation checks for exceptions on other nodes.
std::function< bool()> stop_callback
Optional callback function that returns a boolean value indicating if we should stop the simulation....
mpi::communicator comm
MPI communicator.
std::int64_t cycle_length
Number of MC steps/moves per cycle.
bool propagate_exception
Should we propagate exceptions to all MPI ranks or abort the simulation immediately?
double check_cycles_interval
Time interval (in seconds) after which the simulation checks if all other nodes have finished their c...
bool enable_measures
Should we take measurements during the simulation? Usually false during the warmup phase.
Small helpers that format wall-clock timestamps and durations for human-readable logs.