SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <ranges>
17 #include <string>
18 #include <vector>
19 
38 
39 namespace seqan3
40 {
41 
116 class format_sam : private detail::format_sam_base
117 {
118 public:
122  // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
123  format_sam() = default;
124  format_sam(format_sam const &) = default;
125  format_sam & operator=(format_sam const &) = default;
126  format_sam(format_sam &&) = default;
127  format_sam & operator=(format_sam &&) = default;
128  ~format_sam() = default;
129 
131 
134  {"sam"},
135  };
136 
137 protected:
138  template <typename stream_type, // constraints checked by file
139  typename seq_legal_alph_type,
140  typename stream_pos_type,
141  typename seq_type, // other constraints checked inside function
142  typename id_type,
143  typename qual_type>
144  void read_sequence_record(stream_type & stream,
146  stream_pos_type & position_buffer,
147  seq_type & sequence,
148  id_type & id,
149  qual_type & qualities);
150 
151  template <typename stream_type, // constraints checked by file
152  typename seq_type, // other constraints checked inside function
153  typename id_type,
154  typename qual_type>
155  void write_sequence_record(stream_type & stream,
156  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
157  seq_type && sequence,
158  id_type && id,
159  qual_type && qualities);
160 
161  template <typename stream_type, // constraints checked by file
162  typename seq_legal_alph_type,
163  typename ref_seqs_type,
164  typename ref_ids_type,
165  typename stream_pos_type,
166  typename seq_type,
167  typename id_type,
168  typename offset_type,
169  typename ref_seq_type,
170  typename ref_id_type,
171  typename ref_offset_type,
172  typename align_type,
173  typename cigar_type,
174  typename flag_type,
175  typename mapq_type,
176  typename qual_type,
177  typename mate_type,
178  typename tag_dict_type,
179  typename e_value_type,
180  typename bit_score_type>
181  void read_alignment_record(stream_type & stream,
182  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
183  ref_seqs_type & ref_seqs,
185  stream_pos_type & position_buffer,
186  seq_type & seq,
187  qual_type & qual,
188  id_type & id,
189  offset_type & offset,
190  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
191  ref_id_type & ref_id,
192  ref_offset_type & ref_offset,
193  align_type & align,
194  cigar_type & cigar_vector,
195  flag_type & flag,
196  mapq_type & mapq,
197  mate_type & mate,
198  tag_dict_type & tag_dict,
199  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
200  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
201 
202  template <typename stream_type,
203  typename header_type,
204  typename seq_type,
205  typename id_type,
206  typename ref_seq_type,
207  typename ref_id_type,
208  typename align_type,
209  typename qual_type,
210  typename mate_type,
211  typename tag_dict_type,
212  typename e_value_type,
213  typename bit_score_type>
214  void write_alignment_record(stream_type & stream,
215  sam_file_output_options const & options,
216  header_type && header,
217  seq_type && seq,
218  qual_type && qual,
219  id_type && id,
220  int32_t const offset,
221  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
222  ref_id_type && ref_id,
223  std::optional<int32_t> ref_offset,
224  align_type && align,
225  std::vector<cigar> const & cigar_vector,
226  sam_flag const flag,
227  uint8_t const mapq,
228  mate_type && mate,
229  tag_dict_type && tag_dict,
230  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
231  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
232 
233 private:
235  std::string tmp_qual{};
236 
238  static constexpr std::string_view dummy{};
239 
241  sam_file_header<> default_header{};
242 
244  bool ref_info_present_in_header{false};
245 
247  std::string_view const & default_or(detail::ignore_t) const noexcept
248  {
249  return dummy;
250  }
251 
253  template <typename t>
254  decltype(auto) default_or(t && v) const noexcept
255  {
256  return std::forward<t>(v);
257  }
258 
259  template <typename stream_view_type, arithmetic value_type>
260  void
261  read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view, value_type value);
262 
263  template <typename stream_view_type>
264  void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view);
265 
266  template <typename stream_view_type>
267  void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
268 
269  template <typename stream_it_t, std::ranges::forward_range field_type>
270  void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
271 
272  template <typename stream_it_t>
273  void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
274 
275  template <typename stream_it_t>
276  void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
277 };
278 
280 template <typename stream_type, // constraints checked by file
281  typename seq_legal_alph_type,
282  typename stream_pos_type,
283  typename seq_type, // other constraints checked inside function
284  typename id_type,
285  typename qual_type>
286 inline void format_sam::read_sequence_record(stream_type & stream,
288  stream_pos_type & position_buffer,
289  seq_type & sequence,
290  id_type & id,
291  qual_type & qualities)
292 {
294 
295  {
296  read_alignment_record(stream,
297  align_options,
298  std::ignore,
299  default_header,
300  position_buffer,
301  sequence,
302  qualities,
303  id,
304  std::ignore,
305  std::ignore,
306  std::ignore,
307  std::ignore,
308  std::ignore,
309  std::ignore,
310  std::ignore,
311  std::ignore,
312  std::ignore,
313  std::ignore,
314  std::ignore,
315  std::ignore);
316  }
317 
318  if constexpr (!detail::decays_to_ignore_v<seq_type>)
319  if (std::ranges::distance(sequence) == 0)
320  throw parse_error{"The sequence information must not be empty."};
321  if constexpr (!detail::decays_to_ignore_v<id_type>)
322  if (std::ranges::distance(id) == 0)
323  throw parse_error{"The id information must not be empty."};
324 
325  if (options.truncate_ids)
326  id = id | detail::take_until_and_consume(is_space) | ranges::to<id_type>();
327 }
328 
330 template <typename stream_type, // constraints checked by file
331  typename seq_type, // other constraints checked inside function
332  typename id_type,
333  typename qual_type>
334 inline void format_sam::write_sequence_record(stream_type & stream,
335  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
336  seq_type && sequence,
337  id_type && id,
338  qual_type && qualities)
339 {
340  using default_align_t = std::pair<std::span<gapped<char>>, std::span<gapped<char>>>;
341  using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
342 
343  sam_file_output_options output_options;
344 
345  write_alignment_record(stream,
346  output_options,
347  /*header*/ std::ignore,
348  /*seq*/ default_or(sequence),
349  /*qual*/ default_or(qualities),
350  /*id*/ default_or(id),
351  /*offset*/ 0,
352  /*ref_seq*/ std::string_view{},
353  /*ref_id*/ std::string_view{},
354  /*ref_offset*/ -1,
355  /*align*/ default_align_t{},
356  /*cigar_vector*/ std::vector<cigar>{},
357  /*flag*/ sam_flag::none,
358  /*mapq*/ 0,
359  /*mate*/ default_mate_t{},
360  /*tag_dict*/ sam_tag_dictionary{},
361  /*e_value*/ 0,
362  /*bit_score*/ 0);
363 }
364 
366 template <typename stream_type, // constraints checked by file
367  typename seq_legal_alph_type,
368  typename ref_seqs_type,
369  typename ref_ids_type,
370  typename stream_pos_type,
371  typename seq_type,
372  typename id_type,
373  typename offset_type,
374  typename ref_seq_type,
375  typename ref_id_type,
376  typename ref_offset_type,
377  typename align_type,
378  typename cigar_type,
379  typename flag_type,
380  typename mapq_type,
381  typename qual_type,
382  typename mate_type,
383  typename tag_dict_type,
384  typename e_value_type,
385  typename bit_score_type>
386 inline void
388  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
389  ref_seqs_type & ref_seqs,
391  stream_pos_type & position_buffer,
392  seq_type & seq,
393  qual_type & qual,
394  id_type & id,
395  offset_type & offset,
396  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
397  ref_id_type & ref_id,
398  ref_offset_type & ref_offset,
399  align_type & align,
400  cigar_type & cigar_vector,
401  flag_type & flag,
402  mapq_type & mapq,
403  mate_type & mate,
404  tag_dict_type & tag_dict,
405  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
406  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
407 {
408  static_assert(detail::decays_to_ignore_v<ref_offset_type>
409  || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
410  "The ref_offset must be a specialisation of std::optional.");
411 
412  auto stream_view = detail::istreambuf(stream);
413  auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
414 
415  // these variables need to be stored to compute the ALIGNMENT
416  int32_t ref_offset_tmp{};
417  std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
418  [[maybe_unused]] int32_t offset_tmp{};
419  [[maybe_unused]] int32_t soft_clipping_end{};
420  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
421  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
422 
423  // Header
424  // -------------------------------------------------------------------------------------------------------------
425  if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
426  {
427  read_header(stream_view, header, ref_seqs);
428 
429  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
430  return;
431  }
432 
433  // Store the current file position in the buffer.
434  position_buffer = stream.tellg();
435 
436  // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
437  // -------------------------------------------------------------------------------------------------------------
438  if constexpr (!detail::decays_to_ignore_v<id_type>)
439  read_forward_range_field(field_view, id);
440  else
441  detail::consume(field_view);
442 
443  uint16_t flag_integral{};
444  read_arithmetic_field(field_view, flag_integral);
445  flag = sam_flag{flag_integral};
446 
447  read_forward_range_field(field_view, ref_id_tmp);
448  check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
449 
450  read_arithmetic_field(field_view, ref_offset_tmp);
451  --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
452 
453  if (ref_offset_tmp == -1)
454  ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
455  else if (ref_offset_tmp > -1)
456  ref_offset = ref_offset_tmp;
457  else if (ref_offset_tmp < -1)
458  throw format_error{"No negative values are allowed for field::ref_offset."};
459 
460  if constexpr (!detail::decays_to_ignore_v<mapq_type>)
461  read_arithmetic_field(field_view, mapq);
462  else
463  detail::consume(field_view);
464 
465  // Field 6: CIGAR
466  // -------------------------------------------------------------------------------------------------------------
467  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
468  {
469  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
470  {
471  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
472  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
473  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
474  }
475  else
476  {
477  ++std::ranges::begin(field_view); // skip '*'
478  }
479  }
480  else
481  {
482  detail::consume(field_view);
483  }
484 
485  offset = offset_tmp;
486 
487  // Field 7-9: (RNEXT PNEXT TLEN) = MATE
488  // -------------------------------------------------------------------------------------------------------------
489  if constexpr (!detail::decays_to_ignore_v<mate_type>)
490  {
491  std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
492  read_forward_range_field(field_view, tmp_mate_ref_id); // RNEXT
493 
494  if (tmp_mate_ref_id == "=") // indicates "same as ref id"
495  {
496  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
497  get<0>(mate) = ref_id;
498  else
499  check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
500  }
501  else
502  {
503  check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
504  }
505 
506  int32_t tmp_pnext{};
507  read_arithmetic_field(field_view, tmp_pnext); // PNEXT
508 
509  if (tmp_pnext > 0)
510  get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
511  else if (tmp_pnext < 0)
512  throw format_error{"No negative values are allowed at the mate mapping position."};
513  // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
514 
515  read_arithmetic_field(field_view, get<2>(mate)); // TLEN
516  }
517  else
518  {
519  for (size_t i = 0; i < 3u; ++i)
520  {
521  detail::consume(field_view);
522  }
523  }
524 
525  // Field 10: Sequence
526  // -------------------------------------------------------------------------------------------------------------
527  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
528  {
529  constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
530  auto seq_stream =
531  field_view
533  [is_legal_alph](char const c) // enforce legal alphabet
534  {
535  if (!is_legal_alph(c))
536  throw parse_error{std::string{"Encountered an unexpected letter: "} + "char_is_valid_for<"
537  + detail::type_name_as_string<seq_legal_alph_type>
538  + "> evaluated to false on " + detail::make_printable(c)};
539  return c;
540  });
541 
542  if constexpr (detail::decays_to_ignore_v<seq_type>)
543  {
544  if constexpr (!detail::decays_to_ignore_v<align_type>)
545  {
546  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
547  "If you want to read ALIGNMENT but not SEQ, the alignment"
548  " object must store a sequence container at the second (query) position.");
549 
550  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
551  {
552 
553  auto tmp_iter = std::ranges::begin(seq_stream);
554  std::ranges::advance(tmp_iter, offset_tmp);
555 
556  for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
557  {
558  get<1>(align).push_back(
559  std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
560  ++tmp_iter;
561  }
562 
563  std::ranges::advance(tmp_iter, soft_clipping_end);
564  }
565  else
566  {
567  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
568  }
569  }
570  else
571  {
572  detail::consume(seq_stream);
573  }
574  }
575  else
576  {
577  read_forward_range_field(seq_stream, seq);
578 
579  if constexpr (!detail::decays_to_ignore_v<align_type>)
580  {
581  if (!tmp_cigar_vector
582  .empty()) // if no alignment info is given, the field::alignment should remain empty
583  {
584  assign_unaligned(get<1>(align),
585  seq
586  | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
587  std::ranges::size(seq) - soft_clipping_end));
588  }
589  }
590  }
591  }
592  else
593  {
594  ++std::ranges::begin(field_view); // skip '*'
595  }
596 
597  // Field 11: Quality
598  // -------------------------------------------------------------------------------------------------------------
599  auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
600  auto qual_view = stream_view | detail::take_until_or_throw(tab_or_end);
601  if constexpr (!detail::decays_to_ignore_v<qual_type>)
602  read_forward_range_field(qual_view, qual);
603  else
604  detail::consume(qual_view);
605 
606  if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
607  {
608  if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0
609  && std::ranges::distance(seq) != std::ranges::distance(qual))
610  {
611  throw format_error{detail::to_string("Sequence length (",
612  std::ranges::distance(seq),
613  ") and quality length (",
614  std::ranges::distance(qual),
615  ") must be the same.")};
616  }
617  }
618 
619  // All remaining optional fields if any: SAM tags dictionary
620  // -------------------------------------------------------------------------------------------------------------
621  while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
622  {
623  ++std::ranges::begin(stream_view); // skip tab
624  auto stream_until_tab_or_end = stream_view | detail::take_until_or_throw(tab_or_end);
625  if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
626  read_sam_dict_field(stream_until_tab_or_end, tag_dict);
627  else
628  detail::consume(stream_until_tab_or_end);
629  }
630 
631  detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
632 
633  // DONE READING - wrap up
634  // -------------------------------------------------------------------------------------------------------------
635  // Alignment object construction
636  // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
637  if constexpr (!detail::decays_to_ignore_v<align_type>)
638  {
639  int32_t ref_idx{(ref_id_tmp.empty() /*unmapped read?*/) ? -1 : 0};
640 
641  if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
642  {
643  if (!ref_id_tmp.empty())
644  {
645  assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
646  ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
647  }
648  }
649 
650  construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
651  }
652 
653  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
654  std::swap(cigar_vector, tmp_cigar_vector);
655 }
656 
658 template <typename stream_type,
659  typename header_type,
660  typename seq_type,
661  typename id_type,
662  typename ref_seq_type,
663  typename ref_id_type,
664  typename align_type,
665  typename qual_type,
666  typename mate_type,
667  typename tag_dict_type,
668  typename e_value_type,
669  typename bit_score_type>
670 inline void format_sam::write_alignment_record(stream_type & stream,
671  sam_file_output_options const & options,
672  header_type && header,
673  seq_type && seq,
674  qual_type && qual,
675  id_type && id,
676  int32_t const offset,
677  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
678  ref_id_type && ref_id,
679  std::optional<int32_t> ref_offset,
680  align_type && align,
681  std::vector<cigar> const & cigar_vector,
682  sam_flag const flag,
683  uint8_t const mapq,
684  mate_type && mate,
685  tag_dict_type && tag_dict,
686  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
687  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
688 {
689  /* Note the following general things:
690  *
691  * - Given the SAM specifications, all fields may be empty
692  *
693  * - arithmetic values default to 0 while all others default to '*'
694  *
695  * - Because of the former, arithmetic values can be directly streamed
696  * into 'stream' as operator<< is defined for all arithmetic types
697  * and the default value (0) is also the SAM default.
698  *
699  * - All other non-arithmetic values need to be checked for emptiness
700  */
701 
702  // ---------------------------------------------------------------------
703  // Type Requirements (as static asserts for user friendliness)
704  // ---------------------------------------------------------------------
705  static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
706  "The seq object must be a std::ranges::forward_range over "
707  "letters that model seqan3::alphabet.");
708 
709  static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
710  "The id object must be a std::ranges::forward_range over "
711  "letters that model seqan3::alphabet.");
712 
713  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
714  {
715  static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
716  || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
717  "The ref_id object must be a std::ranges::forward_range "
718  "over letters that model seqan3::alphabet.");
719 
720  if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
721  || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
722  static_assert(!detail::decays_to_ignore_v<header_type>,
723  "If you give indices as reference id information the header must also be present.");
724  }
725 
726  static_assert(tuple_like<std::remove_cvref_t<align_type>>,
727  "The align object must be a std::pair of two ranges whose "
728  "value_type is comparable to seqan3::gap");
729 
730  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
731  && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
732  && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
733  "The align object must be a std::pair of two ranges whose "
734  "value_type is comparable to seqan3::gap");
735 
736  static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
737  "The qual object must be a std::ranges::forward_range "
738  "over letters that model seqan3::alphabet.");
739 
740  static_assert(tuple_like<std::remove_cvref_t<mate_type>>,
741  "The mate object must be a std::tuple of size 3 with "
742  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
743  "2) a std::integral or std::optional<std::integral>, and "
744  "3) a std::integral.");
745 
746  static_assert(
747  ((std::ranges::forward_range<decltype(std::get<0>(mate))>
748  || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
749  || detail::is_type_specialisation_of_v<
750  std::remove_cvref_t<decltype(std::get<0>(mate))>,
751  std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
752  || detail::is_type_specialisation_of_v<
753  std::remove_cvref_t<decltype(std::get<1>(mate))>,
754  std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
755  "The mate object must be a std::tuple of size 3 with "
756  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
757  "2) a std::integral or std::optional<std::integral>, and "
758  "3) a std::integral.");
759 
760  if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
761  || detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>,
762  std::optional>)
763  static_assert(!detail::decays_to_ignore_v<header_type>,
764  "If you give indices as mate reference id information the header must also be present.");
765 
766  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
767  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
768 
769  // ---------------------------------------------------------------------
770  // logical Requirements
771  // ---------------------------------------------------------------------
772  if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
773  && !std::integral<std::remove_reference_t<ref_id_type>>
774  && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
775  {
776 
777  if (options.sam_require_header && !std::ranges::empty(ref_id))
778  {
779  auto id_it = header.ref_dict.end();
780 
781  if constexpr (std::ranges::contiguous_range<decltype(ref_id)> && std::ranges::sized_range<decltype(ref_id)>
782  && std::ranges::borrowed_range<decltype(ref_id)>)
783  {
784  id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
785  }
786  else
787  {
788  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
789 
791  "The ref_id type is not convertible to the reference id information stored in the "
792  "reference dictionary of the header object.");
793 
794  id_it = header.ref_dict.find(ref_id);
795  }
796 
797  if (id_it == header.ref_dict.end()) // no reference id matched
798  throw format_error{detail::to_string("The ref_id '",
799  ref_id,
800  "' was not in the list of references:",
801  header.ref_ids())};
802  }
803  }
804 
805  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
806  throw format_error{"The ref_offset object must be a std::integral >= 0."};
807 
808  // ---------------------------------------------------------------------
809  // Writing the Header on first call
810  // ---------------------------------------------------------------------
811  if constexpr (!detail::decays_to_ignore_v<header_type>)
812  {
813  if (options.sam_require_header && !header_was_written)
814  {
815  write_header(stream, options, header);
816  header_was_written = true;
817  }
818  }
819 
820  // ---------------------------------------------------------------------
821  // Writing the Record
822  // ---------------------------------------------------------------------
823 
824  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
825  constexpr char separator{'\t'};
826 
827  write_range_or_asterisk(stream_it, id);
828  *stream_it = separator;
829 
830  stream_it.write_number(static_cast<uint16_t>(flag));
831  *stream_it = separator;
832 
833  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
834  {
835  if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
836  {
837  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
838  }
839  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
840  {
841  if (ref_id.has_value())
842  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
843  else
844  *stream_it = '*';
845  }
846  else
847  {
848  write_range_or_asterisk(stream_it, ref_id);
849  }
850  }
851  else
852  {
853  *stream_it = '*';
854  }
855 
856  *stream_it = separator;
857 
858  // SAM is 1 based, 0 indicates unmapped read if optional is not set
859  stream_it.write_number(ref_offset.value_or(-1) + 1);
860  *stream_it = separator;
861 
862  stream_it.write_number(static_cast<unsigned>(mapq));
863  *stream_it = separator;
864 
865  if (!std::ranges::empty(cigar_vector))
866  {
867  for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
868  stream_it.write_range(c.to_string());
869  }
870  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
871  {
872  // compute possible distance from alignment end to sequence end
873  // which indicates soft clipping at the end.
874  // This should be replace by a free count_gaps function for
875  // aligned sequences which is more efficient if possible.
876  size_t off_end{std::ranges::size(seq) - offset};
877  for (auto chr : get<1>(align))
878  if (chr == gap{})
879  ++off_end;
880 
881  // Might happen if get<1>(align) doesn't correspond to the reference.
882  assert(off_end >= std::ranges::size(get<1>(align)));
883  off_end -= std::ranges::size(get<1>(align));
884 
885  write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
886  }
887  else
888  {
889  *stream_it = '*';
890  }
891 
892  *stream_it = separator;
893 
894  if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
895  {
896  write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
897  }
898  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>,
899  std::optional>)
900  {
901  if (get<0>(mate).has_value())
902  write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value()]);
903  else
904  *stream_it = '*';
905  }
906  else
907  {
908  write_range_or_asterisk(stream_it, get<0>(mate));
909  }
910 
911  *stream_it = separator;
912 
913  if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
914  {
915  // SAM is 1 based, 0 indicates unmapped read if optional is not set
916  stream_it.write_number(get<1>(mate).value_or(-1) + 1);
917  *stream_it = separator;
918  }
919  else
920  {
921  stream_it.write_number(get<1>(mate));
922  *stream_it = separator;
923  }
924 
925  stream_it.write_number(get<2>(mate));
926  *stream_it = separator;
927 
928  write_range_or_asterisk(stream_it, seq);
929  *stream_it = separator;
930 
931  write_range_or_asterisk(stream_it, qual);
932 
933  write_tag_fields(stream_it, tag_dict, separator);
934 
935  stream_it.write_end_of_line(options.add_carriage_return);
936 }
937 
955 template <typename stream_view_type, arithmetic value_type>
956 inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
957  stream_view_type && stream_view,
958  value_type value)
959 {
960  std::vector<value_type> tmp_vector;
961  while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
962  {
963  read_arithmetic_field(stream_view | detail::take_until(is_char<','>), value);
964  tmp_vector.push_back(value);
965 
966  if (is_char<','>(*std::ranges::begin(stream_view)))
967  ++std::ranges::begin(stream_view); // skip ','
968  }
969  variant = std::move(tmp_vector);
970 }
971 
985 template <typename stream_view_type>
986 inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view)
987 {
988  std::vector<std::byte> tmp_vector;
989  std::byte value;
990 
991  while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
992  {
993  try
994  {
995  read_byte_field(stream_view | detail::take_exactly_or_throw(2), value);
996  }
997  catch (std::exception const & e)
998  {
999  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1000  }
1001 
1002  tmp_vector.push_back(value);
1003  }
1004 
1005  variant = std::move(tmp_vector);
1006 }
1007 
1025 template <typename stream_view_type>
1026 inline void format_sam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1027 {
1028  /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
1029  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1030  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
1031  VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
1032  */
1033  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
1034  ++std::ranges::begin(stream_view); // skip char read before
1035  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
1036  ++std::ranges::begin(stream_view); // skip char read before
1037  ++std::ranges::begin(stream_view); // skip ':'
1038  char type_id = *std::ranges::begin(stream_view);
1039  ++std::ranges::begin(stream_view); // skip char read before
1040  ++std::ranges::begin(stream_view); // skip ':'
1041 
1042  switch (type_id)
1043  {
1044  case 'A': // char
1045  {
1046  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1047  ++std::ranges::begin(stream_view); // skip char that has been read
1048  break;
1049  }
1050  case 'i': // int32_t
1051  {
1052  int32_t tmp;
1053  read_arithmetic_field(stream_view, tmp);
1054  target[tag] = tmp;
1055  break;
1056  }
1057  case 'f': // float
1058  {
1059  float tmp;
1060  read_arithmetic_field(stream_view, tmp);
1061  target[tag] = tmp;
1062  break;
1063  }
1064  case 'Z': // string
1065  {
1066  target[tag] = stream_view | ranges::to<std::string>();
1067  break;
1068  }
1069  case 'H':
1070  {
1071  read_sam_byte_vector(target[tag], stream_view);
1072  break;
1073  }
1074  case 'B': // Array. Value type depends on second char [cCsSiIf]
1075  {
1076  char array_value_type_id = *std::ranges::begin(stream_view);
1077  ++std::ranges::begin(stream_view); // skip char read before
1078  ++std::ranges::begin(stream_view); // skip first ','
1079 
1080  switch (array_value_type_id)
1081  {
1082  case 'c': // int8_t
1083  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1084  break;
1085  case 'C': // uint8_t
1086  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1087  break;
1088  case 's': // int16_t
1089  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1090  break;
1091  case 'S': // uint16_t
1092  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1093  break;
1094  case 'i': // int32_t
1095  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1096  break;
1097  case 'I': // uint32_t
1098  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1099  break;
1100  case 'f': // float
1101  read_sam_dict_vector(target[tag], stream_view, float{});
1102  break;
1103  default:
1104  throw format_error{std::string("The first character in the numerical ")
1105  + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
1106  + "' was given."};
1107  }
1108  break;
1109  }
1110  default:
1111  throw format_error{std::string("The second character in the numerical id of a "
1112  "SAM tag must be one of [A,i,Z,H,B,f] but '")
1113  + type_id + "' was given."};
1114  }
1115 }
1116 
1124 template <typename stream_it_t, std::ranges::forward_range field_type>
1125 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1126 {
1127  if (std::ranges::empty(field_value))
1128  {
1129  *stream_it = '*';
1130  }
1131  else
1132  {
1133  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1134  stream_it.write_range(field_value);
1135  else // convert from alphabets to their character representation
1136  stream_it.write_range(field_value | views::to_char);
1137  }
1138 }
1139 
1146 template <typename stream_it_t>
1147 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1148 {
1149  write_range_or_asterisk(stream_it, std::string_view{field_value});
1150 }
1151 
1159 template <typename stream_it_t>
1160 inline void
1161 format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1162 {
1163  auto const stream_variant_fn = [&stream_it](auto && arg) // helper to print a std::variant
1164  {
1165  using T = std::remove_cvref_t<decltype(arg)>;
1166 
1167  if constexpr (std::ranges::input_range<T>)
1168  {
1169  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1170  {
1171  stream_it.write_range(arg);
1172  }
1173  else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1174  {
1175  if (!std::ranges::empty(arg))
1176  {
1177  stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1178 
1179  for (auto && elem : arg | std::views::drop(1))
1180  {
1181  *stream_it = ',';
1182  stream_it.write_number(std::to_integer<uint8_t>(elem));
1183  }
1184  }
1185  }
1186  else
1187  {
1188  if (!std::ranges::empty(arg))
1189  {
1190  stream_it.write_number(*std::ranges::begin(arg));
1191 
1192  for (auto && elem : arg | std::views::drop(1))
1193  {
1194  *stream_it = ',';
1195  stream_it.write_number(elem);
1196  }
1197  }
1198  }
1199  }
1200  else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1201  {
1202  *stream_it = arg;
1203  }
1204  else // number
1205  {
1206  stream_it.write_number(arg);
1207  }
1208  };
1209 
1210  for (auto & [tag, variant] : tag_dict)
1211  {
1212  *stream_it = separator;
1213 
1214  char const char0 = tag / 256;
1215  char const char1 = tag % 256;
1216 
1217  *stream_it = char0;
1218  *stream_it = char1;
1219  *stream_it = ':';
1220  *stream_it = detail::sam_tag_type_char[variant.index()];
1221  *stream_it = ':';
1222 
1223  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1224  {
1225  *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1226  *stream_it = ',';
1227  }
1228 
1229  std::visit(stream_variant_fn, variant);
1230  }
1231 }
1232 
1233 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
The SAM format (tag).
Definition: format_sam.hpp:117
format_sam & operator=(format_sam const &)=default
Defaulted.
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:133
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:334
format_sam(format_sam &&)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:286
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:670
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:387
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_char
Return the char representation of an alphabet object.
Definition: alphabet/concept.hpp:386
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:125
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:469
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides seqan3::ranges::to.
The <ranges> header from C++20's standard library.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: io/exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: sam_file/output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: sam_file/output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: sequence_file/input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sequence_file/output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::views::to_char.
T tuple_size_v
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)