fastq_to_fasta
A template for creation of SeqAn3 apps, with a FASTQ to FASTA example app.
search_single.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
8#pragma once
9
10#include <seqan3/search/views/minimiser_hash.hpp>
11
18
19namespace raptor
20{
21
22template <typename index_t>
23void search_single(search_arguments const & arguments, index_t && index)
24{
25 constexpr bool is_ibf = std::same_as<index_t, raptor_index<index_structure::ibf>>
26 || std::same_as<index_t, raptor_index<index_structure::ibf_compressed>>;
27
28 double index_io_time{0.0};
29 double reads_io_time{0.0};
30 double compute_time{0.0};
31
32 auto cereal_worker = [&]()
33 {
34 load_index(index, arguments, index_io_time);
35 };
36 auto cereal_handle = std::async(std::launch::async, cereal_worker);
37
38 seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{
39 arguments.query_file};
40 using record_type = typename decltype(fin)::record_type;
41 std::vector<record_type> records{};
42
43 sync_out synced_out{arguments.out_file};
44
45 {
46 size_t position{};
47 std::string line{};
48 for (auto const & file_list : arguments.bin_path)
49 {
50 line.clear();
51 line = '#';
52 line += std::to_string(position);
53 line += '\t';
54 for (auto const & filename : file_list)
55 {
56 line += filename;
57 line += ',';
58 }
59 line.back() = '\n';
60 synced_out << line;
61 ++position;
62 }
63 synced_out << "#QUERY_NAME\tUSER_BINS\n";
64 }
65
66 raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()};
67
68 auto worker = [&](size_t const start, size_t const end)
69 {
70 auto counter = [&index]()
71 {
72 if constexpr (is_ibf)
73 return index.ibf().template counting_agent<uint16_t>();
74 else
75 return index.ibf().membership_agent();
76 }();
77 std::string result_string{};
78 std::vector<uint64_t> minimiser;
79
80 auto hash_adaptor = seqan3::views::minimiser_hash(arguments.shape,
81 seqan3::window_size{arguments.window_size},
82 seqan3::seed{adjust_seed(arguments.shape_weight)});
83
84 for (auto && [id, seq] : records | seqan3::views::slice(start, end))
85 {
86 result_string.clear();
87 result_string += id;
88 result_string += '\t';
89
90 auto minimiser_view = seq | hash_adaptor | std::views::common;
91 minimiser.assign(minimiser_view.begin(), minimiser_view.end());
92
93 size_t const minimiser_count{minimiser.size()};
94 size_t const threshold = thresholder.get(minimiser_count);
95
96 if constexpr (is_ibf)
97 {
98 auto & result = counter.bulk_count(minimiser);
99 size_t current_bin{0};
100 for (auto && count : result)
101 {
102 if (count >= threshold)
103 {
104 result_string += std::to_string(current_bin);
105 result_string += ',';
106 }
107 ++current_bin;
108 }
109 }
110 else
111 {
112 auto & result = counter.bulk_contains(minimiser, threshold); // Results contains user bin IDs
113 for (auto && count : result)
114 {
115 result_string += std::to_string(count);
116 result_string += ',';
117 }
118 }
119
120 if (auto & last_char = result_string.back(); last_char == ',')
121 last_char = '\n';
122 else
123 result_string += '\n';
124 synced_out.write(result_string);
125 }
126 };
127
128 for (auto && chunked_records : fin | seqan3::views::chunk((1ULL << 20) * 10))
129 {
130 records.clear();
131 auto start = std::chrono::high_resolution_clock::now();
132 std::ranges::move(chunked_records, std::back_inserter(records));
133 auto end = std::chrono::high_resolution_clock::now();
134 reads_io_time += std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
135
136 cereal_handle.wait();
137
138 do_parallel(worker, records.size(), arguments.threads, compute_time);
139 }
140
141 // GCOVR_EXCL_START
142 if (arguments.write_time)
143 {
144 std::filesystem::path file_path{arguments.out_file};
145 file_path += ".time";
146 std::ofstream file_handle{file_path};
147 file_handle << "Index I/O\tReads I/O\tCompute\n";
148 file_handle << std::fixed << std::setprecision(2) << index_io_time << '\t' << reads_io_time << '\t'
149 << compute_time;
150 }
151 // GCOVR_EXCL_STOP
152}
153
154} // namespace raptor
Definition: sync_out.hpp:18
Definition: threshold.hpp:17
Definition: adjust_seed.hpp:13
void do_parallel(algorithm_t &&worker, size_t const num_records, size_t const threads, double &compute_time)
Definition: do_parallel.hpp:18
void search_single(search_arguments const &arguments, index_t &&index)
Definition: search_single.hpp:23
void load_index(index_t &index, search_arguments const &arguments, size_t const part, double &index_io_time)
Definition: load_index.hpp:19
Definition: search_arguments.hpp:27
uint8_t threads
Definition: search_arguments.hpp:33
raptor::threshold::threshold_parameters make_threshold_parameters() const noexcept
Definition: search_arguments.hpp:58
seqan3::shape shape
Definition: search_arguments.hpp:30
std::filesystem::path query_file
Definition: search_arguments.hpp:51
std::vector< std::vector< std::string > > bin_path
Definition: search_arguments.hpp:50
bool write_time
Definition: search_arguments.hpp:53
std::filesystem::path out_file
Definition: search_arguments.hpp:52