SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
search_scheme_algorithm.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <type_traits>
16 
24 
25 namespace seqan3::detail
26 {
27 
34 template <typename configuration_t, typename index_t, typename... policies_t>
35  requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
36 class search_scheme_algorithm : protected policies_t...
37 {
38 private:
40  using traits_t = search_traits<configuration_t>;
42  using search_result_type = typename traits_t::search_result_type;
43 
44  static_assert(!std::same_as<search_result_type, empty_type>, "The search result type was not configured.");
45 
46 public:
50  search_scheme_algorithm() = default;
51  search_scheme_algorithm(search_scheme_algorithm const &) = default;
52  search_scheme_algorithm(search_scheme_algorithm &&) = default;
53  search_scheme_algorithm & operator=(search_scheme_algorithm const &) = default;
54  search_scheme_algorithm & operator=(search_scheme_algorithm &&) = default;
55  ~search_scheme_algorithm() = default;
56 
68  search_scheme_algorithm(configuration_t const & cfg, index_t const & index) : policies_t{cfg}...
69  {
70  stratum = cfg.get_or(search_cfg::hit_strata{0}).stratum;
71  index_ptr = std::addressof(index);
72  }
74 
95  template <tuple_like indexed_query_t, typename callback_t>
96  requires (std::tuple_size_v<indexed_query_t> == 2)
97  && std::ranges::forward_range<std::tuple_element_t<1, indexed_query_t>>
98  && std::invocable<callback_t, search_result_type>
99  void operator()(indexed_query_t && indexed_query, callback_t && callback)
100  {
101  auto && [query_idx, query] = indexed_query;
102  auto error_state = this->max_error_counts(query); // see policy_max_error
103 
104  // construct internal delegate for collecting hits for later filtering (if necessary)
106  auto on_hit_delegate = [&internal_hits](auto const & it)
107  {
108  internal_hits.push_back(it);
109  };
110 
111  perform_search_by_hit_strategy(internal_hits, query, error_state, on_hit_delegate);
112 
113  // Invoke the callback on the generated result.
114  this->make_results(std::move(internal_hits), query_idx, callback); // see policy_search_result_builder
115  }
116 
117 private:
119  index_t const * index_ptr{nullptr};
120 
122  uint8_t stratum{};
123 
124  // forward declaration
125  template <bool abort_on_hit, typename query_t, typename delegate_t>
126  inline void search_algo_bi(query_t & query, search_param const error_left, delegate_t && delegate);
127 
135  template <typename query_t, typename delegate_t>
136  void perform_search_by_hit_strategy(std::vector<typename index_t::cursor_type> & internal_hits,
137  query_t & query,
138  search_param error_state,
139  delegate_t const & on_hit_delegate)
140  {
141  if constexpr (!traits_t::search_all_hits)
142  {
143  auto max_total = error_state.total;
144  error_state.total = 0; // start search with less errors
145  while (internal_hits.empty() && error_state.total <= max_total)
146  {
147  // * If you only want the best hit (traits_t::search_single_best_hit), you stop after finding the
148  // first hit, the hit with the least errors (`abort_on_hit` is true).
149  // * If you are in strata mode (traits_t::search_strata_hits), you do the same as with best hits,
150  // but then do the extra step afterwards (`abort_on_hit` is true).
151  // * If you want all best hits (traits_t::search_all_best_hits), you do not stop after the first
152  // hit but continue the current search algorithm/max_error pattern (`abort_on_hit` is true).
153  constexpr bool abort_on_hit = !traits_t::search_all_best_hits;
154  search_algo_bi<abort_on_hit>(query, error_state, on_hit_delegate);
155  ++error_state.total;
156  }
157  if constexpr (traits_t::search_strata_hits)
158  {
159  if (!internal_hits.empty())
160  {
161  internal_hits.clear(); // TODO:don't clear when using Optimum Search Schemes with lower error bounds
162  error_state.total += stratum - 1;
163  search_algo_bi<false>(query, error_state, on_hit_delegate);
164  }
165  }
166  }
167  else // detail::search_mode_all
168  {
169  // If you want to find all hits, you cannot stop once you found any hit (<false>)
170  // since you have to find all paths in the search tree that satisfy the hit condition.
171  search_algo_bi<false>(query, error_state, on_hit_delegate);
172  }
173  }
174 };
175 
189 inline std::vector<search_dyn> compute_ss(uint8_t const min_error, uint8_t const max_error)
190 {
191  // TODO: Replace this at least by the pigeonhole principle or even better by 01*0 schemes.
192  // NOTE: Make sure that the searches are sorted by their asymptotical running time (i.e. upper error bound string),
193  // s.t. easy to compute searches come first. This improves the running time of algorithms that abort after the
194  // first hit (e.g. search strategy: best). Even though it is not guaranteed, this seems to be a good greedy
195  // approach.
196  std::vector<search_dyn> scheme{{{1}, {min_error}, {max_error}}};
197  return scheme;
198 }
199 
217 template <typename search_scheme_t>
218 inline auto search_scheme_block_info(search_scheme_t const & search_scheme, size_t const query_length)
219 {
220  using blocks_length_type = typename search_scheme_t::value_type::blocks_length_type;
221 
222  constexpr bool is_dyn_scheme = std::same_as<search_scheme_t, search_scheme_dyn_type>;
223 
224  // Either store information in an array (for search schemes known at compile time) or in a vector otherwise.
225  using result_type = std::conditional_t<
226  is_dyn_scheme,
229  transformation_trait_or_t<std::tuple_size<search_scheme_t>, std::false_type>::value>>;
230 
231  result_type result;
232  if constexpr (is_dyn_scheme)
233  result.resize(search_scheme.size());
234 
235  uint8_t const blocks{search_scheme[0].blocks()};
236  size_t const block_length{query_length / blocks};
237  uint8_t const rest{static_cast<uint8_t>(query_length % blocks)};
238 
239  blocks_length_type blocks_length;
240  // set all blocks_length values to block_length
241  // resp. block_length + 1 for the first `rest = block_length % blocks` values
242  if constexpr (is_dyn_scheme)
243  blocks_length.resize(blocks, block_length);
244  else
245  blocks_length.fill(block_length);
246 
247  for (uint8_t block_id = 0; block_id < rest; ++block_id)
248  ++blocks_length[block_id];
249 
250  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
251  {
252  auto const & search = search_scheme[search_id];
253 
254  auto & [search_blocks_length, start_pos] = result[search_id];
255 
256  // compute cumulative blocks_length and starting position
257  start_pos = 0;
258  if constexpr (is_dyn_scheme)
259  search_blocks_length.resize(blocks);
260  search_blocks_length[0] = blocks_length[search.pi[0] - 1];
261  for (uint8_t i = 1; i < blocks; ++i)
262  {
263  search_blocks_length[i] = blocks_length[search.pi[i] - 1] + search_blocks_length[i - 1];
264  if (search.pi[i] < search.pi[0])
265  start_pos += search_blocks_length[i] - search_blocks_length[i - 1];
266  }
267  }
268 
269  return result;
270 }
271 
273 // forward declaration
274 template <bool abort_on_hit,
275  typename cursor_t,
276  typename query_t,
277  typename search_t,
278  typename blocks_length_t,
279  typename delegate_t>
280 inline bool search_ss(cursor_t cur,
281  query_t & query,
282  typename cursor_t::size_type const lb,
283  typename cursor_t::size_type const rb,
284  uint8_t const errors_spent,
285  uint8_t const block_id,
286  bool const go_right,
287  search_t const & search,
288  blocks_length_t const & blocks_length,
289  search_param const error_left,
290  delegate_t && delegate);
292 
324 template <bool abort_on_hit,
325  typename cursor_t,
326  typename query_t,
327  typename search_t,
328  typename blocks_length_t,
329  typename delegate_t>
330 inline bool search_ss_exact(cursor_t cur,
331  query_t & query,
332  typename cursor_t::size_type const lb,
333  typename cursor_t::size_type const rb,
334  uint8_t const errors_spent,
335  uint8_t const block_id,
336  bool const go_right,
337  search_t const & search,
338  blocks_length_t const & blocks_length,
339  search_param const error_left,
340  delegate_t && delegate)
341 {
342  using size_type = typename cursor_t::size_type;
343 
344  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
345  bool const go_right2 = (block_id < search.blocks() - 1) && (search.pi[block_id + 1] > search.pi[block_id]);
346 
347  if (go_right)
348  {
349  size_type const infix_lb = rb - 1; // inclusive
350  size_type const infix_rb = lb + blocks_length[block_id] - 1; // exclusive
351 
352  if (!cur.extend_right(query | views::slice(infix_lb, infix_rb + 1)))
353  return false;
354 
355  if (search_ss<abort_on_hit>(cur,
356  query,
357  lb,
358  infix_rb + 2,
359  errors_spent,
360  block_id2,
361  go_right2,
362  search,
363  blocks_length,
364  error_left,
365  delegate)
366  && abort_on_hit)
367  {
368  return true;
369  }
370  }
371  else
372  {
373  size_type const infix_lb = rb - blocks_length[block_id] - 1; // inclusive
374  size_type const infix_rb = lb - 1; // inclusive
375 
376  if (!cur.extend_left(query | views::slice(infix_lb, infix_rb + 1)))
377  return false;
378 
379  if (search_ss<abort_on_hit>(cur,
380  query,
381  infix_lb,
382  rb,
383  errors_spent,
384  block_id2,
385  go_right2,
386  search,
387  blocks_length,
388  error_left,
389  delegate)
390  && abort_on_hit)
391  {
392  return true;
393  }
394  }
395  return false;
396 }
397 
404 template <bool abort_on_hit,
405  typename cursor_t,
406  typename query_t,
407  typename search_t,
408  typename blocks_length_t,
409  typename delegate_t>
410 inline bool search_ss_deletion(cursor_t cur,
411  query_t & query,
412  typename cursor_t::size_type const lb,
413  typename cursor_t::size_type const rb,
414  uint8_t const errors_spent,
415  uint8_t const block_id,
416  bool const go_right,
417  search_t const & search,
418  blocks_length_t const & blocks_length,
419  search_param const error_left,
420  delegate_t && delegate)
421 {
422  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
423  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0);
424 
425  // Switch to the next block when the min number of errors is reached
426  if (min_error_left_in_block == 0)
427  {
428  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
429  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
430 
431  if (search_ss<abort_on_hit>(cur,
432  query,
433  lb,
434  rb,
435  errors_spent,
436  block_id2,
437  go_right2,
438  search,
439  blocks_length,
440  error_left,
441  delegate)
442  && abort_on_hit)
443  {
444  return true;
445  }
446  }
447 
448  // Insert deletions into the current block as long as possible
449  // Do not allow deletions at the beginning of the leftmost block
450  // Do not allow deletions at the end of the rightmost block
451  if (!(search.pi[block_id] == 1 && !go_right) && !(search.pi[block_id] == search.blocks() && go_right)
452  && max_error_left_in_block > 0 && error_left.total > 0 && error_left.deletion > 0
453  && ((go_right && cur.extend_right()) || (!go_right && cur.extend_left())))
454  {
455  search_param error_left2{error_left};
456  error_left2.total--;
457  error_left2.deletion--;
458  do
459  {
460  if (search_ss_deletion<abort_on_hit>(cur,
461  query,
462  lb,
463  rb,
464  errors_spent + 1,
465  block_id,
466  go_right,
467  search,
468  blocks_length,
469  error_left2,
470  delegate)
471  && abort_on_hit)
472  {
473  return true;
474  }
475  }
476  while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
477  }
478  return false;
479 }
480 
489 template <bool abort_on_hit,
490  typename cursor_t,
491  typename query_t,
492  typename search_t,
493  typename blocks_length_t,
494  typename delegate_t>
495 inline bool search_ss_children(cursor_t cur,
496  query_t & query,
497  typename cursor_t::size_type const lb,
498  typename cursor_t::size_type const rb,
499  uint8_t const errors_spent,
500  uint8_t const block_id,
501  bool const go_right,
502  uint8_t const min_error_left_in_block,
503  search_t const & search,
504  blocks_length_t const & blocks_length,
505  search_param const error_left,
506  delegate_t && delegate)
507 {
508  using size_type = typename cursor_t::size_type;
509  if ((go_right && cur.extend_right()) || (!go_right && cur.extend_left()))
510  {
511  size_type const chars_left = blocks_length[block_id] - (rb - lb - 1);
512 
513  size_type lb2 = lb - !go_right;
514  size_type rb2 = rb + go_right;
515 
516  do
517  {
518  bool const delta = cur.last_rank() != to_rank(query[(go_right ? rb : lb) - 1]);
519 
520  // skip if there are more min errors left in the current block than characters in the block
521  // i.e. chars_left - 1 < min_error_left_in_block - delta
522  // TODO: move that outside the if / do-while struct
523  // TODO: incorporate error_left.deletion into formula
524  if (error_left.deletion == 0 && chars_left + delta < min_error_left_in_block + 1u)
525  continue;
526 
527  if (!delta || error_left.substitution > 0)
528  {
529  search_param error_left2{error_left};
530  error_left2.total -= delta;
531  error_left2.substitution -= delta;
532 
533  // At the end of the current block
534  if (rb - lb == blocks_length[block_id])
535  {
536  // Leave the possibility for one or multiple deletions at the end of a block.
537  // Thus do not change the direction (go_right) yet.
538  if (error_left.deletion > 0)
539  {
540  if (search_ss_deletion<abort_on_hit>(cur,
541  query,
542  lb2,
543  rb2,
544  errors_spent + delta,
545  block_id,
546  go_right,
547  search,
548  blocks_length,
549  error_left2,
550  delegate)
551  && abort_on_hit)
552  {
553  return true;
554  }
555  }
556  else
557  {
558  uint8_t const block_id2 = std::min<uint8_t>(block_id + 1, search.blocks() - 1);
559  bool const go_right2 = block_id2 == 0 ? true : search.pi[block_id2] > search.pi[block_id2 - 1];
560 
561  if (search_ss<abort_on_hit>(cur,
562  query,
563  lb2,
564  rb2,
565  errors_spent + delta,
566  block_id2,
567  go_right2,
568  search,
569  blocks_length,
570  error_left2,
571  delegate)
572  && abort_on_hit)
573  {
574  return true;
575  }
576  }
577  }
578  else
579  {
580  if (search_ss<abort_on_hit>(cur,
581  query,
582  lb2,
583  rb2,
584  errors_spent + delta,
585  block_id,
586  go_right,
587  search,
588  blocks_length,
589  error_left2,
590  delegate)
591  && abort_on_hit)
592  {
593  return true;
594  }
595  }
596  }
597 
598  // Deletion
599  // TODO: check whether the conditions for deletions at the beginning/end of the query are really necessary
600  // No deletion at the beginning of the leftmost block.
601  // No deletion at the end of the rightmost block.
602  if (error_left.deletion > 0 && !(go_right && (rb == 1 || rb == std::ranges::size(query) + 1))
603  && !(!go_right && (lb == 0 || lb == std::ranges::size(query))))
604  {
605  search_param error_left3{error_left};
606  error_left3.total--;
607  error_left3.deletion--;
608  search_ss<abort_on_hit>(cur,
609  query,
610  lb,
611  rb,
612  errors_spent + 1,
613  block_id,
614  go_right,
615  search,
616  blocks_length,
617  error_left3,
618  delegate);
619  }
620  }
621  while ((go_right && cur.cycle_back()) || (!go_right && cur.cycle_front()));
622  }
623  return false;
624 }
625 
631 template <bool abort_on_hit,
632  typename cursor_t,
633  typename query_t,
634  typename search_t,
635  typename blocks_length_t,
636  typename delegate_t>
637 inline bool search_ss(cursor_t cur,
638  query_t & query,
639  typename cursor_t::size_type const lb,
640  typename cursor_t::size_type const rb,
641  uint8_t const errors_spent,
642  uint8_t const block_id,
643  bool const go_right,
644  search_t const & search,
645  blocks_length_t const & blocks_length,
646  search_param const error_left,
647  delegate_t && delegate)
648 {
649  uint8_t const max_error_left_in_block = search.u[block_id] - errors_spent;
650  uint8_t const min_error_left_in_block = std::max(search.l[block_id] - errors_spent, 0); // NOTE: changed
651 
652  // Done.
653  if (min_error_left_in_block == 0 && lb == 0 && rb == std::ranges::size(query) + 1)
654  {
655  delegate(cur);
656  return true;
657  }
658  // Exact search in current block.
659  else if (((max_error_left_in_block == 0) && (rb - lb - 1 != blocks_length[block_id]))
660  || (error_left.total == 0 && min_error_left_in_block == 0))
661  {
662  if (search_ss_exact<abort_on_hit>(cur,
663  query,
664  lb,
665  rb,
666  errors_spent,
667  block_id,
668  go_right,
669  search,
670  blocks_length,
671  error_left,
672  delegate)
673  && abort_on_hit)
674  {
675  return true;
676  }
677  }
678  // Approximate search in current block.
679  // i.e. blocks_length[block_id] - (rb - lb - (lb != rb)) >= min_error_left_in_block
680  else if (error_left.total > 0)
681  {
682  // Insertion
683  if (error_left.insertion > 0)
684  {
685  using size_type = typename cursor_t::size_type;
686 
687  size_type const lb2 = lb - !go_right;
688  size_type const rb2 = rb + go_right;
689 
690  search_param error_left2{error_left};
691  error_left2.total--;
692  error_left2.insertion--;
693  // At the end of the current block
694  if (rb - lb == blocks_length[block_id])
695  {
696  // Leave the possibility for one or multiple deletions at the end of a block.
697  // Thus do not change the direction (go_right) yet.
698  // TODO: benchmark the improvement on preventing insertions followed by a deletion and vice versa. Does
699  // it pay off the additional complexity and documentation for the user? (Note that the user might only
700  // allow for insertions and deletion and not for mismatches).
701  if (search_ss_deletion<abort_on_hit>(cur,
702  query,
703  lb2,
704  rb2,
705  errors_spent + 1,
706  block_id,
707  go_right,
708  search,
709  blocks_length,
710  error_left2,
711  delegate)
712  && abort_on_hit)
713  {
714  return true;
715  }
716  }
717  else
718  {
719  if (search_ss<abort_on_hit>(cur,
720  query,
721  lb2,
722  rb2,
723  errors_spent + 1,
724  block_id,
725  go_right,
726  search,
727  blocks_length,
728  error_left2,
729  delegate)
730  && abort_on_hit)
731  {
732  return true;
733  }
734  }
735  }
736  if (search_ss_children<abort_on_hit>(cur,
737  query,
738  lb,
739  rb,
740  errors_spent,
741  block_id,
742  go_right,
743  min_error_left_in_block,
744  search,
745  blocks_length,
746  error_left,
747  delegate)
748  && abort_on_hit)
749  {
750  return true;
751  }
752  }
753  return false;
754 }
755 
779 template <bool abort_on_hit, typename index_t, typename query_t, typename search_scheme_t, typename delegate_t>
780 inline void search_ss(index_t const & index,
781  query_t & query,
782  search_param const error_left,
783  search_scheme_t const & search_scheme,
784  delegate_t && delegate)
785 {
786  // retrieve cumulative block lengths and starting position
787  auto const block_info = search_scheme_block_info(search_scheme, std::ranges::size(query));
788 
789  for (uint8_t search_id = 0; search_id < search_scheme.size(); ++search_id)
790  {
791  auto const & search = search_scheme[search_id];
792  auto const & [blocks_length, start_pos] = block_info[search_id];
793 
794  bool const hit = search_ss<abort_on_hit>(index.cursor(), // cursor on the index
795  query, // query to be searched
796  start_pos,
797  start_pos + 1, // infix range already searched (open interval)
798  // the first character of `query` has the index 1 (not 0)
799  0, // errors spent
800  0, // current block id in search scheme
801  true, // search the first block from left to right
802  search,
803  blocks_length, // search scheme information
804  error_left, // errors left (broken down by error types)
805  delegate // delegate function called on hit
806  );
807 
808  if (abort_on_hit && hit)
809  return;
810  }
811 }
812 
831 template <typename configuration_t, typename index_t, typename... policies_t>
832  requires (template_specialisation_of<typename index_t::cursor_type, bi_fm_index_cursor>)
833 template <bool abort_on_hit, typename query_t, typename delegate_t>
834 inline void search_scheme_algorithm<configuration_t, index_t, policies_t...>::search_algo_bi(
835  query_t & query,
836  search_param const error_left,
837  delegate_t && delegate)
838 {
839  switch (error_left.total)
840  {
841  case 0:
842  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 0>, delegate);
843  break;
844  case 1:
845  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 1>, delegate);
846  break;
847  case 2:
848  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 2>, delegate);
849  break;
850  case 3:
851  search_ss<abort_on_hit>(*index_ptr, query, error_left, optimum_search_scheme<0, 3>, delegate);
852  break;
853  default:
854  auto const & search_scheme{compute_ss(0, error_left.total)};
855  search_ss<abort_on_hit>(*index_ptr, query, error_left, search_scheme, delegate);
856  break;
857  }
858 }
859 
860 } // namespace seqan3::detail
T addressof(T... args)
Provides the bidirectional seqan3::bi_fm_index.
T clear(T... args)
T empty(T... args)
requires requires
The rank_type of the semi-alphabet; defined as the return type of seqan3::to_rank....
Definition: alphabet/concept.hpp:164
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
requires std::ranges::forward_range< std::ranges::range_reference_t< queries_t > > &&std::same_as< range_innermost_value_t< queries_t >, typename index_t::alphabet_type > auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:103
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
T max(T... args)
T push_back(T... args)
Provides the concept for seqan3::detail::sdsl_index.
Provides data structures used by different search algorithms.
Provides the data structures and precomputed instances for (optimum) search schemes.
Provides seqan3::detail::search_traits.
Provides seqan3::views::slice.
Provides seqan3::detail::transformation_trait_or.