SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
bloom_filter.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
16 
17 namespace seqan3
18 {
19 
83 template <data_layout data_layout_mode_ = data_layout::uncompressed>
85 {
86 private:
88  template <data_layout data_layout_mode>
89  friend class bloom_filter;
91 
93  using data_type =
95 
97  size_t size_in_bits{};
99  size_t hash_shift{};
101  size_t hash_funs{};
103  data_type data{};
105  static constexpr std::array<size_t, 5> hash_seeds{13572355802537770549ULL, // 2**64 / (e/2)
106  13043817825332782213ULL, // 2**64 / sqrt(2)
107  10650232656628343401ULL, // 2**64 / sqrt(3)
108  16499269484942379435ULL, // 2**64 / (sqrt(5)/2)
109  4893150838803335377ULL}; // 2**64 / (3*pi/5)
110 
118  inline constexpr size_t hash_and_fit(size_t h, size_t const seed) const
119  {
120  h *= seed;
121  h ^= h >> hash_shift; // XOR and shift higher bits into lower bits
122  h *= 11400714819323198485ULL; // = 2^64 / golden_ration, to expand h to 64 bit range
123  // Use fastrange (integer modulo without division) if possible.
124 #ifdef __SIZEOF_INT128__
125  h = static_cast<uint64_t>((static_cast<__uint128_t>(h) * static_cast<__uint128_t>(size_in_bits)) >> 64);
126 #else
127  h %= size_in_bits;
128 #endif
129  return h;
130  }
131 
132 public:
134  static constexpr data_layout data_layout_mode = data_layout_mode_;
135 
139  bloom_filter() = default;
140  bloom_filter(bloom_filter const &) = default;
141  bloom_filter & operator=(bloom_filter const &) = default;
142  bloom_filter(bloom_filter &&) = default;
144  ~bloom_filter() = default;
145 
160  {
161  size_in_bits = size.get();
162  hash_funs = funs.get();
163 
164  if (hash_funs == 0 || hash_funs > 5)
165  throw std::logic_error{"The number of hash functions must be > 0 and <= 5."};
166  if (size_in_bits == 0)
167  throw std::logic_error{"The size of a bloom filter must be > 0."};
168 
169  hash_shift = std::countl_zero(size_in_bits);
170  data = sdsl::bit_vector(size_in_bits);
171  }
172 
184  bloom_filter(bloom_filter<data_layout::uncompressed> const & bf)
186  {
187  std::tie(size_in_bits, hash_shift, hash_funs) = std::tie(bf.size_in_bits, bf.hash_shift, bf.hash_funs);
188 
189  data = sdsl::sd_vector<>{bf.data};
190  }
192 
207  void emplace(size_t const value) noexcept
209  {
210  for (size_t i = 0; i < hash_funs; ++i)
211  {
212  size_t idx = hash_and_fit(value, hash_seeds[i]);
213  assert(idx < data.size());
214  data[idx] = 1;
215  };
216  }
217 
230  void reset() noexcept
232  {
233  sdsl::util::_set_zero_bits(data);
234  }
236 
251  bool contains(size_t const value) const noexcept
252  {
253  for (size_t i = 0; i < hash_funs; i++)
254  {
255  size_t idx = hash_and_fit(value, hash_seeds[i]);
256  assert(idx < data.size());
257  if (data[idx] == 0)
258  return false;
259  }
260  return true;
261  }
263 
282  template <std::ranges::range value_range_t>
283  size_t count(value_range_t && values) const noexcept
284  {
285  static_assert(std::ranges::input_range<value_range_t>, "The values must model input_range.");
286  static_assert(std::unsigned_integral<std::ranges::range_value_t<value_range_t>>,
287  "An individual value must be an unsigned integral.");
288 
289  size_t result = 0;
290 
291  for (auto && value : values)
292  result += contains(value);
293 
294  return result;
295  }
297 
304  size_t hash_function_count() const noexcept
305  {
306  return hash_funs;
307  }
308 
312  size_t bit_size() const noexcept
313  {
314  return size_in_bits;
315  }
317 
326  friend bool operator==(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
327  {
328  return std::tie(lhs.size_in_bits, lhs.hash_shift, lhs.hash_funs, lhs.data)
329  == std::tie(rhs.size_in_bits, rhs.hash_shift, rhs.hash_funs, rhs.data);
330  }
331 
337  friend bool operator!=(bloom_filter const & lhs, bloom_filter const & rhs) noexcept
338  {
339  return !(lhs == rhs);
340  }
342 
353  constexpr data_type & raw_data() noexcept
354  {
355  return data;
356  }
357 
359  constexpr data_type const & raw_data() const noexcept
360  {
361  return data;
362  }
364 
372  template <cereal_archive archive_t>
373  void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
374  {
375  archive(size_in_bits);
376  archive(hash_shift);
377  archive(hash_funs);
378  archive(data);
379  }
381 };
382 
383 } // namespace seqan3
The Bloom Filter. A data structure that efficiently answers set-membership queries.
Definition: bloom_filter.hpp:85
bloom_filter(bloom_filter &&)=default
Defaulted.
static constexpr data_layout data_layout_mode
Indicates whether the Bloom Filter is compressed.
Definition: bloom_filter.hpp:134
bloom_filter & operator=(bloom_filter &&)=default
Defaulted.
bloom_filter(bloom_filter const &)=default
Defaulted.
bloom_filter()=default
Defaulted.
bloom_filter & operator=(bloom_filter const &)=default
Defaulted.
~bloom_filter()=default
Defaulted.
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
data_layout
Determines if the Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:28
@ uncompressed
The Interleaved Bloom Filter is uncompressed.
Definition: interleaved_bloom_filter.hpp:29
@ compressed
The Interleaved Bloom Filter is compressed.
Definition: interleaved_bloom_filter.hpp:30
requires constexpr seqan3::detail::template_specialisation_of< list_t, seqan3::type_list > bool contains
Whether a type occurs in a type list or not.
Definition: type_list/traits.hpp:252
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
Provides seqan3::interleaved_bloom_filter.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T operator!=(T... args)
A strong type that represents the number of bits for each bin in the seqan3::interleaved_bloom_filter...
Definition: interleaved_bloom_filter.hpp:43
A strong type that represents the number of hash functions for the seqan3::interleaved_bloom_filter.
Definition: interleaved_bloom_filter.hpp:50
strong_type for seed.
Definition: minimiser_hash.hpp:25
T tie(T... args)