SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
interleaved_bloom_filter.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <algorithm>
16 #include <bit>
17 
18 #include <sdsl/bit_vectors.hpp>
19 
22 
23 namespace seqan3
24 {
27 enum data_layout : bool
28 {
30  compressed
31 };
32 
35 struct bin_count : public detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>
36 {
37  using detail::strong_type<size_t, bin_count, detail::strong_type_skill::convert>::strong_type;
38 };
39 
42 struct bin_size : public detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>
43 {
44  using detail::strong_type<size_t, bin_size, detail::strong_type_skill::convert>::strong_type;
45 };
46 
49 struct hash_function_count : public detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>
50 {
51  using detail::strong_type<size_t, hash_function_count, detail::strong_type_skill::convert>::strong_type;
52 };
53 
56 struct bin_index : public detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>
57 {
58  using detail::strong_type<size_t, bin_index, detail::strong_type_skill::convert>::strong_type;
59 };
60 
132 template <data_layout data_layout_mode_ = data_layout::uncompressed>
134 {
135 private:
137  template <data_layout data_layout_mode>
138  friend class interleaved_bloom_filter;
140 
142  using data_type =
144 
146  size_t bins{};
148  size_t technical_bins{};
150  size_t bin_size_{};
152  size_t hash_shift{};
154  size_t bin_words{};
156  size_t hash_funs{};
158  data_type data{};
160  static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
161  13043817825332782213ULL, // 2**64 / sqrt(2)
162  10650232656628343401ULL, // 2**64 / sqrt(3)
163  16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
164  4893150838803335377ULL}; // 2**64 / (3*pi/5)
165 
173  inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
174  {
175  h *= seed;
176  assert(hash_shift < 64);
177  h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
178  h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
179  // Use fastrange (integer modulo without division) if possible.
180 #ifdef __SIZEOF_INT128__
181  h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(bin_size_)) >> 64);
182 #else
183  h %= bin_size_;
184 #endif
185  h *= technical_bins;
186  return h;
187  }
188 
189 public:
191  static constexpr data_layout data_layout_mode = data_layout_mode_;
192 
193  class membership_agent_type; // documented upon definition below
194 
195  template <std::integral value_t>
196  class counting_agent_type; // documented upon definition below
197 
207 
225  {
226  bins = bins_.get();
227  bin_size_ = size.get();
228  hash_funs = funs.get();
229 
230  if (bins == 0)
231  throw std::logic_error{"The number of bins must be > 0."};
232  if (hash_funs == 0 || hash_funs > 5)
233  throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
234  if (bin_size_ == 0)
235  throw std::logic_error{"The size of a bin must be > 0."};
236 
237  hash_shift = std::countl_zero(bin_size_);
238  bin_words = (bins + 63) >> 6; // = ceil(bins/64)
239  technical_bins = bin_words << 6; // = bin_words * 64
240  data = sdsl::bit_vector(technical_bins * bin_size_);
241  }
242 
254  interleaved_bloom_filter(interleaved_bloom_filter<data_layout::uncompressed> const & ibf)
256  {
257  std::tie(bins, technical_bins, bin_size_, hash_shift, bin_words, hash_funs) =
258  std::tie(ibf.bins, ibf.technical_bins, ibf.bin_size_, ibf.hash_shift, ibf.bin_words, ibf.hash_funs);
259 
260  data = sdsl::sd_vector<>{ibf.data};
261  }
263 
279  void emplace(size_t const value, bin_index const bin) noexcept
281  {
282  assert(bin.get() < bins);
283  for (size_t i = 0; i < hash_funs; ++i)
284  {
285  size_t idx = hash_and_fit(value, hash_seeds[i]);
286  idx += bin.get();
287  assert(idx < data.size());
288  data[idx] = 1;
289  };
290  }
291 
303  void clear(bin_index const bin) noexcept
305  {
306  assert(bin.get() < bins);
307  for (size_t idx = bin.get(), i = 0; i < bin_size_; idx += technical_bins, ++i)
308  data[idx] = 0;
309  }
310 
324  template <typename rng_t>
326  void clear(rng_t && bin_range) noexcept
327  {
328  static_assert(std::ranges::forward_range<rng_t>, "The range of bins to clear must model a forward_range.");
329  static_assert(std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<rng_t>>, bin_index>,
330  "The reference type of the range to clear must be seqan3::bin_index.");
331 #ifndef NDEBUG
332  for (auto && bin : bin_range)
333  assert(bin.get() < bins);
334 #endif // NDEBUG
335 
336  for (size_t offset = 0, i = 0; i < bin_size_; offset += technical_bins, ++i)
337  for (auto && bin : bin_range)
338  data[bin.get() + offset] = 0;
339  }
340 
364  void increase_bin_number_to(bin_count const new_bins_)
366  {
367  size_t new_bins = new_bins_.get();
368 
369  if (new_bins < bins)
370  throw std::invalid_argument{"The number of new bins must be >= the current number of bins."};
371 
372  // Equivalent to ceil(new_bins / 64)
373  size_t new_bin_words = (new_bins + 63) >> 6;
374 
375  bins = new_bins;
376 
377  if (new_bin_words == bin_words) // No need for internal resize if bin_words does not change.
378  return;
379 
380  size_t new_technical_bins = new_bin_words << 6;
381  size_t new_bits = bin_size_ * new_technical_bins;
382 
383  size_t idx_{new_bits}, idx{data.size()};
384  size_t delta = new_technical_bins - technical_bins + 64;
385 
386  data.resize(new_bits);
387 
388  for (size_t i = idx_, j = idx; j > 0; i -= new_technical_bins, j -= technical_bins)
389  {
390  size_t stop = i - new_technical_bins;
391 
392  for (size_t ii = i - delta, jj = j - 64; stop && ii >= stop; ii -= 64, jj -= 64)
393  {
394  uint64_t old = data.get_int(jj);
395  data.set_int(jj, 0);
396  data.set_int(ii, old);
397  }
398  }
399 
400  bin_words = new_bin_words;
401  technical_bins = new_technical_bins;
402  }
404 
419  membership_agent_type membership_agent() const
420  {
421  return membership_agent_type{*this};
422  }
423 
435  template <typename value_t = uint16_t>
436  counting_agent_type<value_t> counting_agent() const
437  {
438  return counting_agent_type<value_t>{*this};
439  }
441 
448  size_t hash_function_count() const noexcept
449  {
450  return hash_funs;
451  }
452 
456  size_t bin_count() const noexcept
457  {
458  return bins;
459  }
460 
464  size_t bin_size() const noexcept
465  {
466  return bin_size_;
467  }
468 
472  size_t bit_size() const noexcept
473  {
474  return data.size();
475  }
477 
486  friend bool operator==(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
487  {
488  return std::tie(lhs.bins,
489  lhs.technical_bins,
490  lhs.bin_size_,
491  lhs.hash_shift,
492  lhs.bin_words,
493  lhs.hash_funs,
494  lhs.data)
495  == std::tie(rhs.bins,
496  rhs.technical_bins,
497  rhs.bin_size_,
498  rhs.hash_shift,
499  rhs.bin_words,
500  rhs.hash_funs,
501  rhs.data);
502  }
503 
509  friend bool operator!=(interleaved_bloom_filter const & lhs, interleaved_bloom_filter const & rhs) noexcept
510  {
511  return !(lhs == rhs);
512  }
514 
525  constexpr data_type & raw_data() noexcept
526  {
527  return data;
528  }
529 
531  constexpr data_type const & raw_data() const noexcept
532  {
533  return data;
534  }
536 
544  template <cereal_archive archive_t>
545  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
546  {
547  archive(bins);
548  archive(technical_bins);
549  archive(bin_size_);
550  archive(hash_shift);
551  archive(bin_words);
552  archive(hash_funs);
553  archive(data);
554  }
556 };
557 
568 template <data_layout data_layout_mode>
570 {
571 private:
574 
576  ibf_t const * ibf_ptr{nullptr};
577 
578 public:
579  class binning_bitvector;
580 
584  membership_agent_type() = default;
590 
595  explicit membership_agent_type(ibf_t const & ibf) : ibf_ptr(std::addressof(ibf)), result_buffer(ibf.bin_count())
596  {}
598 
601 
622  [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) & noexcept
623  {
624  assert(ibf_ptr != nullptr);
625  assert(result_buffer.size() == ibf_ptr->bin_count());
626 
627  std::array<size_t, 5> bloom_filter_indices;
628  std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
629 
630  for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
631  bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
632 
633  for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
634  {
635  size_t tmp{-1ULL};
636  for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
637  {
638  assert(bloom_filter_indices[i] < ibf_ptr->data.size());
639  tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
640  bloom_filter_indices[i] += 64;
641  }
642 
643  result_buffer.data.set_int(batch << 6, tmp);
644  }
645 
646  return result_buffer;
647  }
648 
649  // `bulk_contains` cannot be called on a temporary, since the object the returned reference points to
650  // is immediately destroyed.
651  [[nodiscard]] binning_bitvector const & bulk_contains(size_t const value) && noexcept = delete;
653 };
654 
656 template <data_layout data_layout_mode>
658 {
659 private:
661  using data_type = sdsl::bit_vector;
663  data_type data{};
664 
665  friend class membership_agent_type;
666 
667 public:
671  binning_bitvector() = default;
676  ~binning_bitvector() = default;
677 
679  explicit binning_bitvector(size_t const size) : data(size)
680  {}
682 
684  size_t size() const noexcept
685  {
686  return data.size();
687  }
688 
693  auto begin() noexcept
694  {
695  return data.begin();
696  }
697 
699  auto begin() const noexcept
700  {
701  return data.begin();
702  }
703 
705  auto end() noexcept
706  {
707  return data.end();
708  }
709 
711  auto end() const noexcept
712  {
713  return data.end();
714  }
716 
721  friend bool operator==(binning_bitvector const & lhs, binning_bitvector const & rhs) noexcept
722  {
723  return lhs.data == rhs.data;
724  }
725 
727  friend bool operator!=(binning_bitvector const & lhs, binning_bitvector const & rhs) noexcept
728  {
729  return !(lhs == rhs);
730  }
732 
737  auto operator[](size_t const i) noexcept
738  {
739  assert(i < size());
740  return data[i];
741  }
742 
744  auto operator[](size_t const i) const noexcept
745  {
746  assert(i < size());
747  return data[i];
748  }
749 
757  constexpr data_type & raw_data() noexcept
758  {
759  return data;
760  }
761 
763  constexpr data_type const & raw_data() const noexcept
764  {
765  return data;
766  }
768 };
769 
793 template <std::integral value_t>
794 class counting_vector : public std::vector<value_t>
795 {
796 private:
799 
801  template <typename binning_bitvector_t>
802  static constexpr bool is_binning_bitvector =
803  std::same_as<binning_bitvector_t,
805  || std::same_as<binning_bitvector_t,
807 
808 public:
812  counting_vector() = default;
813  counting_vector(counting_vector const &) = default;
817  ~counting_vector() = default;
818 
819  using base_t::base_t;
821 
834  template <typename binning_bitvector_t>
835  requires is_binning_bitvector<binning_bitvector_t>
836  counting_vector & operator+=(binning_bitvector_t const & binning_bitvector)
837  {
838  for_each_set_bin(binning_bitvector,
839  [this](size_t const bin)
840  {
841  ++(*this)[bin];
842  });
843  return *this;
844  }
845 
853  template <typename binning_bitvector_t>
854  requires is_binning_bitvector<binning_bitvector_t>
855  counting_vector & operator-=(binning_bitvector_t const & binning_bitvector)
856  {
857  for_each_set_bin(binning_bitvector,
858  [this](size_t const bin)
859  {
860  assert((*this)[bin] > 0);
861  --(*this)[bin];
862  });
863  return *this;
864  }
865 
877  {
878  assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need.
879 
880  std::transform(this->begin(), this->end(), rhs.begin(), this->begin(), std::plus<value_t>());
881 
882  return *this;
883  }
884 
890  {
891  assert(this->size() >= rhs.size()); // The counting vector may be bigger than what we need.
892 
893  std::transform(this->begin(),
894  this->end(),
895  rhs.begin(),
896  this->begin(),
897  [](auto a, auto b)
898  {
899  assert(a >= b);
900  return a - b;
901  });
902 
903  return *this;
904  }
905 
906 private:
908  template <typename binning_bitvector_t, typename on_bin_fn_t>
909  void for_each_set_bin(binning_bitvector_t && binning_bitvector, on_bin_fn_t && on_bin_fn)
910  {
911  assert(this->size() >= binning_bitvector.size()); // The counting vector may be bigger than what we need.
912 
913  // Jump to the next 1 and return the number of places jumped in the bit_sequence
914  auto jump_to_next_1bit = [](size_t & x)
915  {
916  auto const zeros = std::countr_zero(x);
917  x >>= zeros; // skip number of zeros
918  return zeros;
919  };
920 
921  // Each iteration can handle 64 bits
922  for (size_t bit_pos = 0; bit_pos < binning_bitvector.size(); bit_pos += 64)
923  {
924  // get 64 bits starting at position `bit_pos`
925  size_t bit_sequence = binning_bitvector.raw_data().get_int(bit_pos);
926 
927  // process each relative bin inside the bit_sequence
928  for (size_t bin = bit_pos; bit_sequence != 0u; ++bin, bit_sequence >>= 1)
929  {
930  // Jump to the next 1 and
931  bin += jump_to_next_1bit(bit_sequence);
932 
933  on_bin_fn(bin);
934  }
935  }
936  }
937 };
938 
948 template <data_layout data_layout_mode>
949 template <std::integral value_t>
951 {
952 private:
955 
957  ibf_t const * ibf_ptr{nullptr};
958 
960  membership_agent_type membership_agent;
961 
962 public:
966  counting_agent_type() = default;
971  ~counting_agent_type() = default;
972 
977  explicit counting_agent_type(ibf_t const & ibf) :
978  ibf_ptr(std::addressof(ibf)),
979  membership_agent(ibf),
980  result_buffer(ibf.bin_count())
981  {}
983 
986 
1009  template <std::ranges::range value_range_t>
1010  [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) & noexcept
1011  {
1012  assert(ibf_ptr != nullptr);
1013  assert(result_buffer.size() == ibf_ptr->bin_count());
1014 
1015  static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
1016  static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
1017  "An individual value must be an unsigned integral.");
1018 
1019  std::ranges::fill(result_buffer, 0);
1020 
1021  for (auto && value : values)
1022  result_buffer += membership_agent.bulk_contains(value);
1023 
1024  return result_buffer;
1025  }
1026 
1027  // `bulk_count` cannot be called on a temporary, since the object the returned reference points to
1028  // is immediately destroyed.
1029  template <std::ranges::range value_range_t>
1030  [[nodiscard]] counting_vector<value_t> const & bulk_count(value_range_t && values) && noexcept = delete;
1032 };
1033 
1034 } // namespace seqan3
The <bit> header from C++20's standard library.
Adaptions of concepts from the Cereal library.
A data structure that behaves like a std::vector and can be used to consolidate the results of multip...
Definition: interleaved_bloom_filter.hpp:795
requires is_binning_bitvector< binning_bitvector_t > counting_vector & operator-=(binning_bitvector_t const &binning_bitvector)
Bin-wise subtracts the bits of a seqan3::interleaved_bloom_filter::membership_agent_type::binning_bit...
Definition: interleaved_bloom_filter.hpp:855
counting_vector & operator-=(counting_vector const &rhs)
Bin-wise substraction of two seqan3::counting_vectors.
Definition: interleaved_bloom_filter.hpp:889
requires is_binning_bitvector< binning_bitvector_t > counting_vector & operator+=(binning_bitvector_t const &binning_bitvector)
Bin-wise adds the bits of a seqan3::interleaved_bloom_filter::membership_agent_type::binning_bitvecto...
Definition: interleaved_bloom_filter.hpp:836
counting_vector(counting_vector const &)=default
Defaulted.
~counting_vector()=default
Defaulted.
counting_vector(counting_vector &&)=default
Defaulted.
counting_vector & operator=(counting_vector &&)=default
Defaulted.
counting_vector()=default
Defaulted.
counting_vector & operator=(counting_vector const &)=default
Defaulted.
counting_vector & operator+=(counting_vector const &rhs)
Bin-wise addition of two seqan3::counting_vectors.
Definition: interleaved_bloom_filter.hpp:876
Manages counting ranges of values for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:951
counting_agent_type & operator=(counting_agent_type const &)=default
Defaulted.
counting_vector< value_t > const & bulk_count(value_range_t &&values) &noexcept
Counts the occurrences in each bin for all values in a range.
Definition: interleaved_bloom_filter.hpp:1010
counting_agent_type(counting_agent_type &&)=default
Defaulted.
counting_agent_type(counting_agent_type const &)=default
Defaulted.
counting_agent_type & operator=(counting_agent_type &&)=default
Defaulted.
counting_vector< value_t > result_buffer
Stores the result of bulk_count().
Definition: interleaved_bloom_filter.hpp:985
counting_vector< value_t > const & bulk_count(value_range_t &&values) &&noexcept=delete
Counts the occurrences in each bin for all values in a range.
A bitvector representing the result of a call to bulk_contains of the seqan3::interleaved_bloom_filte...
Definition: interleaved_bloom_filter.hpp:658
binning_bitvector & operator=(binning_bitvector &&)=default
Defaulted.
auto end() noexcept
Returns an iterator to the element following the last element of the container.
Definition: interleaved_bloom_filter.hpp:705
binning_bitvector(binning_bitvector const &)=default
Defaulted.
binning_bitvector & operator=(binning_bitvector const &)=default
Defaulted.
auto end() const noexcept
Returns an iterator to the element following the last element of the container.
Definition: interleaved_bloom_filter.hpp:711
constexpr data_type & raw_data() noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:757
constexpr data_type const & raw_data() const noexcept
Provides direct, unsafe access to the underlying data structure.
Definition: interleaved_bloom_filter.hpp:763
auto operator[](size_t const i) const noexcept
Return the i-th element.
Definition: interleaved_bloom_filter.hpp:744
binning_bitvector(size_t const size)
Construct with given size.
Definition: interleaved_bloom_filter.hpp:679
size_t size() const noexcept
Returns the number of elements.
Definition: interleaved_bloom_filter.hpp:684
auto begin() noexcept
Returns an iterator to the first element of the container.
Definition: interleaved_bloom_filter.hpp:693
auto begin() const noexcept
Returns an iterator to the first element of the container.
Definition: interleaved_bloom_filter.hpp:699
auto operator[](size_t const i) noexcept
Return the i-th element.
Definition: interleaved_bloom_filter.hpp:737
friend bool operator==(binning_bitvector const &lhs, binning_bitvector const &rhs) noexcept
Test for equality.
Definition: interleaved_bloom_filter.hpp:721
friend bool operator!=(binning_bitvector const &lhs, binning_bitvector const &rhs) noexcept
Test for inequality.
Definition: interleaved_bloom_filter.hpp:727
Manages membership queries for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:570
membership_agent_type & operator=(membership_agent_type const &)=default
Defaulted.
binning_bitvector const & bulk_contains(size_t const value) &noexcept
Determines set membership of a given value.
Definition: interleaved_bloom_filter.hpp:622
membership_agent_type & operator=(membership_agent_type &&)=default
Defaulted.
binning_bitvector const & bulk_contains(size_t const value) &&noexcept=delete
Determines set membership of a given value.
membership_agent_type(membership_agent_type &&)=default
Defaulted.
membership_agent_type(membership_agent_type const &)=default
Defaulted.
binning_bitvector result_buffer
Stores the result of bulk_contains().
Definition: interleaved_bloom_filter.hpp:600
The IBF binning directory. A data structure that efficiently answers set-membership queries for multi...
Definition: interleaved_bloom_filter.hpp:134
interleaved_bloom_filter & operator=(interleaved_bloom_filter const &)=default
Defaulted.
interleaved_bloom_filter()=default
Defaulted.
interleaved_bloom_filter(interleaved_bloom_filter &&)=default
Defaulted.
interleaved_bloom_filter(interleaved_bloom_filter const &)=default
Defaulted.
~interleaved_bloom_filter()=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:191
interleaved_bloom_filter & operator=(interleaved_bloom_filter &&)=default
Defaulted.
T data(T... args)
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:28
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition: interleaved_bloom_filter.hpp:29
@ compressed
The Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:30
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
T memcpy(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
T operator!=(T... args)
Provides basic data structure for strong types.
A strong type that represents the number of bins for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:36
A strong type that represents the bin index for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:57
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition: interleaved_bloom_filter.hpp:43
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:50
strong_type for seed.
Definition: minimiser_hash.hpp:25
T tie(T... args)
T transform(T... args)