27 const auto& scalars = msm_data.scalars;
28 const auto& points = msm_data.points;
29 const auto& scalar_indices = msm_data.scalar_indices;
30 const size_t range = scalar_indices.size();
33 for (
size_t i = 0; i < range; ++i) {
35 r += f * scalars[scalar_indices[i]].to_montgomery_form();
40template <
typename Curve>
42 std::vector<uint32_t>& nonzero_scalar_indices)
noexcept
49 auto range = chunk.
range(scalars.size());
53 std::vector<uint32_t>& thread_scalar_indices = thread_indices[chunk.
thread_index];
54 thread_scalar_indices.reserve(range.size());
55 for (
size_t i : range) {
57 auto& scalar = scalars[i];
58 scalar.self_from_montgomery_form();
60 if (!scalar.is_zero()) {
61 thread_scalar_indices.push_back(
static_cast<uint32_t
>(i));
66 size_t num_entries = 0;
67 for (
const auto& indices : thread_indices) {
68 num_entries += indices.size();
70 nonzero_scalar_indices.resize(num_entries);
77 offset += thread_indices[i].size();
85template <
typename Curve>
87 std::span<
std::span<ScalarField>> scalars, std::vector<std::vector<uint32_t>>& msm_scalar_indices)
noexcept
90 const size_t num_msms = scalars.size();
91 msm_scalar_indices.resize(num_msms);
92 for (
size_t i = 0; i < num_msms; ++i) {
93 transform_scalar_and_get_nonzero_scalar_indices(scalars[i], msm_scalar_indices[i]);
96 size_t total_work = 0;
97 for (
const auto& indices : msm_scalar_indices) {
98 total_work += indices.size();
105 const size_t work_of_last_thread = total_work - (work_per_thread * (num_threads - 1));
108 if (num_threads > total_work) {
109 for (
size_t i = 0; i < num_msms; ++i) {
113 .size = msm_scalar_indices[i].size(),
119 size_t thread_accumulated_work = 0;
120 size_t current_thread_idx = 0;
121 for (
size_t i = 0; i < num_msms; ++i) {
122 size_t msm_work_remaining = msm_scalar_indices[i].size();
123 const size_t initial_msm_work = msm_work_remaining;
125 while (msm_work_remaining > 0) {
128 const size_t total_thread_work =
129 (current_thread_idx == num_threads - 1) ? work_of_last_thread : work_per_thread;
130 const size_t available_thread_work = total_thread_work - thread_accumulated_work;
131 const size_t work_to_assign = std::min(available_thread_work, msm_work_remaining);
133 work_units[current_thread_idx].push_back(
MSMWorkUnit{
135 .start_index = initial_msm_work - msm_work_remaining,
136 .size = work_to_assign,
139 thread_accumulated_work += work_to_assign;
140 msm_work_remaining -= work_to_assign;
143 if (thread_accumulated_work >= total_thread_work) {
144 current_thread_idx++;
145 thread_accumulated_work = 0;
167template <
typename Curve>
170 size_t slice_size)
noexcept
172 constexpr size_t LIMB_BITS = 64;
174 size_t hi_bit = NUM_BITS_IN_FIELD - (round * slice_size);
175 size_t lo_bit = (hi_bit < slice_size) ? 0 : hi_bit - slice_size;
180 size_t start_limb = lo_bit / LIMB_BITS;
181 size_t end_limb = hi_bit / LIMB_BITS;
182 size_t lo_slice_offset = lo_bit & (LIMB_BITS - 1);
183 size_t actual_slice_size = hi_bit - lo_bit;
184 size_t lo_slice_bits =
185 (LIMB_BITS - lo_slice_offset < actual_slice_size) ? (LIMB_BITS - lo_slice_offset) : actual_slice_size;
186 size_t hi_slice_bits = actual_slice_size - lo_slice_bits;
188 uint64_t lo_slice = (scalar.data[start_limb] >> lo_slice_offset) & ((1ULL << lo_slice_bits) - 1);
189 uint64_t hi_slice = (start_limb != end_limb) ? (scalar.data[end_limb] & ((1ULL << hi_slice_bits) - 1)) : 0;
191 return static_cast<uint32_t
>(lo_slice | (hi_slice << lo_slice_bits));
197 auto compute_cost = [&](uint32_t bits) {
199 size_t buckets =
size_t{ 1 } << bits;
200 return rounds * (num_points + buckets * BUCKET_ACCUMULATION_COST);
203 uint32_t best_bits = 1;
204 size_t best_cost = compute_cost(1);
205 for (uint32_t bits = 2; bits < MAX_SLICE_BITS; ++bits) {
206 size_t cost = compute_cost(bits);
207 if (cost < best_cost) {
217 if (num_points < AFFINE_TRICK_THRESHOLD) {
229 constexpr size_t COST_OF_INVERSION = NUM_BITS_IN_FIELD + ((NUM_BITS_IN_FIELD + 3) / 4) + INVERSION_TABLE_COST;
231 double log2_num_points = log2(
static_cast<double>(num_points));
232 size_t savings_per_round = (num_points * AFFINE_TRICK_SAVINGS_PER_OP) + (num_buckets * JACOBIAN_Z_NOT_ONE_PENALTY);
233 double inversion_cost_per_round = log2_num_points *
static_cast<double>(COST_OF_INVERSION);
235 return static_cast<double>(savings_per_round) > inversion_cost_per_round;
238template <
typename Curve>
240 const size_t num_points,
248 const size_t num_pairs = num_points / 2;
249 bb::group_elements::batch_affine_add_impl<bb::group_elements::InterleavedArrayPolicy, AffineElement, BaseField>(
250 points, points, num_pairs, scratch_space);
253template <
typename Curve>
256 const size_t size = msm_data.scalar_indices.size();
257 const uint32_t bits_per_slice = get_optimal_log_num_buckets(size);
258 const size_t num_buckets =
size_t{ 1 } << bits_per_slice;
259 const uint32_t num_rounds =
static_cast<uint32_t
>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
260 const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
263 Element msm_result = Curve::Group::point_at_infinity;
265 for (uint32_t round = 0; round < num_rounds; ++round) {
267 for (
size_t i = 0; i < size; ++i) {
268 uint32_t idx = msm_data.scalar_indices[i];
269 uint32_t bucket = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
272 bucket_data.
buckets[bucket] += msm_data.points[idx];
274 bucket_data.
buckets[bucket] = msm_data.points[idx];
281 Element bucket_result = accumulate_buckets(bucket_data);
284 uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
285 for (uint32_t i = 0; i < num_doublings; ++i) {
286 msm_result.self_dbl();
288 msm_result += bucket_result;
293template <
typename Curve>
296 const size_t num_points = msm_data.scalar_indices.size();
297 const uint32_t bits_per_slice = get_optimal_log_num_buckets(num_points);
298 const size_t num_buckets =
size_t{ 1 } << bits_per_slice;
300 if (!use_affine_trick(num_points, num_buckets)) {
301 return jacobian_pippenger_with_transformed_scalars(msm_data);
304 const uint32_t num_rounds =
static_cast<uint32_t
>((NUM_BITS_IN_FIELD + bits_per_slice - 1) / bits_per_slice);
305 const uint32_t remainder = NUM_BITS_IN_FIELD % bits_per_slice;
311 Element msm_result = Curve::Group::point_at_infinity;
313 for (uint32_t round = 0; round < num_rounds; ++round) {
316 for (
size_t i = 0; i < num_points; ++i) {
317 uint32_t idx = msm_data.scalar_indices[i];
318 uint32_t bucket_idx = get_scalar_slice(msm_data.scalars[idx], round, bits_per_slice);
319 msm_data.point_schedule[i] = PointScheduleEntry::create(idx, bucket_idx).data;
324 size_t num_zero_bucket_entries =
326 size_t round_size = num_points - num_zero_bucket_entries;
329 Element bucket_result = Curve::Group::point_at_infinity;
330 if (round_size > 0) {
331 std::span<uint64_t> schedule(&msm_data.point_schedule[num_zero_bucket_entries], round_size);
332 batch_accumulate_points_into_buckets(schedule, msm_data.points, affine_data, bucket_data);
333 bucket_result = accumulate_buckets(bucket_data);
338 uint32_t num_doublings = (round == num_rounds - 1 && remainder != 0) ? remainder : bits_per_slice;
339 for (uint32_t i = 0; i < num_doublings; ++i) {
340 msm_result.self_dbl();
342 msm_result += bucket_result;
348template <
typename Curve>
355 if (point_schedule.empty()) {
360 size_t scratch_it = 0;
361 const size_t num_points = point_schedule.size();
362 const size_t prefetch_max = (num_points >= PREFETCH_LOOKAHEAD) ? (num_points - PREFETCH_LOOKAHEAD) : 0;
363 const size_t last_index = num_points - 1;
366 while (point_it < num_points || scratch_it != 0) {
368 while (((scratch_it + 1) < AffineAdditionData::BATCH_SIZE) && (point_it < last_index)) {
370 if ((point_it < prefetch_max) && ((point_it & PREFETCH_INTERVAL_MASK) == 0)) {
371 for (
size_t i = PREFETCH_LOOKAHEAD / 2; i < PREFETCH_LOOKAHEAD; ++i) {
373 __builtin_prefetch(&points[entry.point_index()]);
380 process_bucket_pair(lhs.bucket_index(),
382 &points[lhs.point_index()],
383 &points[rhs.point_index()],
391 if (point_it == last_index) {
393 process_single_point(
394 last.bucket_index(), &points[last.point_index()], affine_data, bucket_data, scratch_it, point_it);
398 size_t num_points_to_add = scratch_it;
399 if (num_points_to_add >= 2) {
401 affine_data.points_to_add.data(), num_points_to_add, affine_data.inversion_scratch_space.data());
405 AffineElement* affine_output = affine_data.points_to_add.data() + (num_points_to_add / 2);
408 size_t new_scratch_it = 0;
409 size_t output_it = 0;
410 size_t num_outputs = num_points_to_add / 2;
412 while ((num_outputs > 1) && (output_it + 1 < num_outputs)) {
413 uint32_t lhs_bucket = affine_data.addition_result_bucket_destinations[output_it];
414 uint32_t rhs_bucket = affine_data.addition_result_bucket_destinations[output_it + 1];
416 process_bucket_pair(lhs_bucket,
418 &affine_output[output_it],
419 &affine_output[output_it + 1],
427 if (num_outputs > 0 && output_it == num_outputs - 1) {
428 uint32_t bucket = affine_data.addition_result_bucket_destinations[output_it];
429 process_single_point(
430 bucket, &affine_output[output_it], affine_data, bucket_data, new_scratch_it, output_it);
434 scratch_it = new_scratch_it;
438template <
typename Curve>
442 bool handle_edge_cases)
noexcept
445 const size_t num_msms = points.size();
455 auto pippenger_impl =
456 handle_edge_cases ? jacobian_pippenger_with_transformed_scalars : affine_pippenger_with_transformed_scalars;
460 if (!thread_work_units[thread_idx].empty()) {
463 msm_results.reserve(msms.size());
466 std::vector<uint64_t> point_schedule_buffer;
469 point_schedule_buffer.resize(msm.size);
471 MSMData::from_work_unit(scalars, points, msm_scalar_indices, point_schedule_buffer, msm);
473 (msm.size < PIPPENGER_THRESHOLD) ? small_mul<Curve>(msm_data) : pippenger_impl(msm_data);
475 msm_results.emplace_back(msm_result, msm.batch_msm_index);
483 std::vector<Element> results(num_msms, Curve::Group::point_at_infinity);
484 for (
const auto& single_thread_msm_results : thread_msm_results) {
485 for (
const auto& [element,
index] : single_thread_msm_results) {
486 results[
index] += element;
489 Element::batch_normalize(results.data(), num_msms);
492 for (
auto& scalar_span : scalars) {
494 for (size_t i = start; i < end; ++i) {
495 scalar_span[i].self_to_montgomery_form();
500 return std::vector<AffineElement>(results.begin(), results.end());
503template <
typename Curve>
506 bool handle_edge_cases)
noexcept
508 if (scalars.size() == 0) {
509 return Curve::Group::affine_point_at_infinity;
511 const size_t num_scalars = scalars.size();
512 BB_ASSERT_GTE(points.size(), scalars.start_index + num_scalars);
524 auto results = batch_multi_scalar_mul(std::span(points_batch), std::span(scalars_batch), handle_edge_cases);
528template <
typename Curve>
531 [[maybe_unused]]
bool handle_edge_cases)
noexcept
536template <
typename Curve>
545 bool handle_edge_cases =
true) noexcept;
547template curve::Grumpkin::
Element pippenger_unsafe<curve::Grumpkin>(
548 PolynomialSpan<const curve::Grumpkin::ScalarField> scalars,
std::span<const curve::Grumpkin::AffineElement> points);
551 std::span<const curve::BN254::AffineElement> points,
552 bool handle_edge_cases = true);
554template curve::BN254::
Element pippenger_unsafe<curve::BN254>(
PolynomialSpan<const curve::BN254::ScalarField> scalars,
555 std::span<const curve::BN254::AffineElement> points);
559template class
bb::scalar_multiplication::
MSM<
bb::curve::Grumpkin>;
560template class
bb::scalar_multiplication::
MSM<
bb::curve::BN254>;
#define BB_ASSERT_GTE(left, right,...)
#define BB_ASSERT_DEBUG(expression,...)
#define BB_ASSERT_EQ(actual, expected,...)
#define BB_ASSERT_LT(left, right,...)
BB_INLINE bool get(size_t index) const noexcept
BB_INLINE void set(size_t index, bool value) noexcept
typename Group::element Element
typename Group::affine_element AffineElement
typename Curve::BaseField BaseField
static bool use_affine_trick(size_t num_points, size_t num_buckets) noexcept
Decide if batch inversion saves work vs Jacobian additions.
static Element jacobian_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using Jacobian buckets (handles edge cases: doubling, infinity)
static uint32_t get_scalar_slice(const ScalarField &scalar, size_t round, size_t slice_size) noexcept
Extract c-bit slice from scalar for bucket index computation.
static Element affine_pippenger_with_transformed_scalars(MSMData &msm_data) noexcept
Pippenger using affine buckets with batch inversion (faster, no edge case handling)
static void add_affine_points(AffineElement *points, const size_t num_points, typename Curve::BaseField *scratch_space) noexcept
Batch add n/2 independent point pairs using Montgomery's trick.
static std::vector< ThreadWorkUnits > get_work_units(std::span< std::span< ScalarField > > scalars, std::vector< std::vector< uint32_t > > &msm_scalar_indices) noexcept
Distribute multiple MSMs across threads with balanced point counts.
typename Curve::Element Element
static uint32_t get_optimal_log_num_buckets(size_t num_points) noexcept
Compute optimal bits per slice by minimizing cost over c in [1, MAX_SLICE_BITS)
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< std::span< const AffineElement > > points, std::span< std::span< ScalarField > > scalars, bool handle_edge_cases=true) noexcept
Compute multiple MSMs in parallel with work balancing.
static void batch_accumulate_points_into_buckets(std::span< const uint64_t > point_schedule, std::span< const AffineElement > points, AffineAdditionData &affine_data, BucketAccumulators &bucket_data) noexcept
Process sorted point schedule into bucket accumulators using batched affine additions.
typename Curve::ScalarField ScalarField
typename Curve::AffineElement AffineElement
static void transform_scalar_and_get_nonzero_scalar_indices(std::span< ScalarField > scalars, std::vector< uint32_t > &nonzero_scalar_indices) noexcept
Convert scalars from Montgomery form and collect indices of nonzero scalars.
constexpr T ceil_div(const T &numerator, const T &denominator)
Computes the ceiling of the division of two integral types.
Curve::Element small_mul(const typename MSM< Curve >::MSMData &msm_data) noexcept
Curve::Element pippenger(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases) noexcept
Safe MSM wrapper (defaults to handle_edge_cases=true)
size_t sort_point_schedule_and_count_zero_buckets(uint64_t *point_schedule, const size_t num_entries, const uint32_t bucket_index_bits) noexcept
Sort point schedule by bucket index and count zero-bucket entries.
Curve::Element pippenger_unsafe(PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points) noexcept
Fast MSM wrapper for linearly independent points (no edge case handling)
Entry point for Barretenberg command-line interface.
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
auto range(size_t size, size_t offset=0) const
Scratch space for batched affine point additions (one per thread)
Affine bucket accumulators for the fast affine-trick Pippenger variant.
Jacobian bucket accumulators for the safe Pippenger variant.
std::vector< Element > buckets
Container for MSM input data passed between algorithm stages.
MSMWorkUnit describes an MSM that may be part of a larger MSM.
Packed point schedule entry: (point_index << 32) | bucket_index.