SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
bgzf_stream_util.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <algorithm>
16 #include <array>
17 #include <cassert>
18 #include <cstring>
19 #include <memory>
20 #include <span>
21 #include <thread>
22 
25 #include <seqan3/io/exception.hpp>
27 
28 #if !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
29 # error "This file cannot be used when building without GZip-support."
30 #endif // !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
31 
32 #if defined(SEQAN3_HAS_ZLIB)
33 // Zlib headers
34 #include <zlib.h>
35 
36 namespace seqan3::contrib
37 {
38 
41 [[maybe_unused]] inline uint64_t bgzf_thread_count = 4;
42 
43 // ============================================================================
44 // Forwards
45 // ============================================================================
46 
47 // ============================================================================
48 // Classes
49 // ============================================================================
50 
51 // Special end-of-file marker defined by the BGZF compression format.
52 // See: https://samtools.github.io/hts-specs/SAMv1.pdf
53 static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER {{'\x1f', '\x8b', '\x08', '\x04',
54  '\x00', '\x00', '\x00', '\x00',
55  '\x00', '\xff', '\x06', '\x00',
56  '\x42', '\x43', '\x02', '\x00',
57  '\x1b', '\x00', '\x03', '\x00',
58  '\x00', '\x00', '\x00', '\x00',
59  '\x00', '\x00', '\x00', '\x00'}};
60 
61 template <typename TAlgTag>
62 struct CompressionContext {};
63 
64 template <typename TAlgTag>
65 struct DefaultPageSize;
66 
67 template <>
68 struct CompressionContext<detail::gz_compression>
69 {
70  z_stream strm;
71 
72  CompressionContext()
73  {
74  std::memset(&strm, 0, sizeof(z_stream));
75  }
76 };
77 
78 template <>
79 struct CompressionContext<detail::bgzf_compression>:
80  CompressionContext<detail::gz_compression>
81 {
82  static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
83  unsigned char headerPos;
84 };
85 
86 template <>
87 struct DefaultPageSize<detail::bgzf_compression>
88 {
89  static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
90  static const unsigned BLOCK_FOOTER_LENGTH = 8;
91  // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
92  static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
93 
94  // Reduce the maximal input size, such that the compressed data
95  // always fits in one block even for level Z_NO_COMPRESSION.
96  enum { BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH };
97  static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
98 };
99 
100 // ============================================================================
101 // Functions
102 // ============================================================================
103 
104 // ----------------------------------------------------------------------------
105 // Function compressInit()
106 // ----------------------------------------------------------------------------
107 
108 inline void
109 compressInit(CompressionContext<detail::gz_compression> & ctx)
110 {
111  const int GZIP_WINDOW_BITS = -15; // no zlib header
112  const int Z_DEFAULT_MEM_LEVEL = 8;
113 
114  ctx.strm.zalloc = NULL;
115  ctx.strm.zfree = NULL;
116 
117  // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
118  // to be 2x faster and produces only 7% bigger output
119 // int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
120 // GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
121  int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
122  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
123  if (status != Z_OK)
124  throw io_error("Calling deflateInit2() failed for gz file.");
125 }
126 
127 // ----------------------------------------------------------------------------
128 // Function compressInit()
129 // ----------------------------------------------------------------------------
130 
131 inline void
132 compressInit(CompressionContext<detail::bgzf_compression> & ctx)
133 {
134  compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
135  ctx.headerPos = 0;
136 }
137 
138 // ----------------------------------------------------------------------------
139 // Helper Function _bgzfUnpackXX()
140 // ----------------------------------------------------------------------------
141 
142 inline uint16_t
143 _bgzfUnpack16(char const * buffer)
144 {
145  uint16_t tmp;
146  std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
147  return detail::to_little_endian(tmp);
148 }
149 
150 inline uint32_t
151 _bgzfUnpack32(char const * buffer)
152 {
153  uint32_t tmp;
154  std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
155  return detail::to_little_endian(tmp);
156 }
157 
158 // ----------------------------------------------------------------------------
159 // Helper Function _bgzfPackXX()
160 // ----------------------------------------------------------------------------
161 
162 inline void
163 _bgzfPack16(char * buffer, uint16_t value)
164 {
165  value = detail::to_little_endian(value);
166  std::uninitialized_copy(reinterpret_cast<char *>(&value),
167  reinterpret_cast<char *>(&value) + sizeof(uint16_t),
168  buffer);
169 }
170 
171 inline void
172 _bgzfPack32(char * buffer, uint32_t value)
173 {
174  value = detail::to_little_endian(value);
175  std::uninitialized_copy(reinterpret_cast<char *>(&value),
176  reinterpret_cast<char *>(&value) + sizeof(uint32_t),
177  buffer);
178 }
179 
180 // ----------------------------------------------------------------------------
181 // Function _compressBlock()
182 // ----------------------------------------------------------------------------
183 
184 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
185 inline TDestCapacity
186 _compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
187  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
188 {
189  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
190  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
191 
192  assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
193  assert(sizeof(TDestValue) == 1u);
194  assert(sizeof(unsigned) == 4u);
195 
196  // 1. COPY HEADER
197  std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
198 
199  // 2. COMPRESS
200  compressInit(ctx);
201  ctx.strm.next_in = (Bytef *)(srcBegin);
202  ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
203  ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
204  ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
205 
206  int status = deflate(&ctx.strm, Z_FINISH);
207  if (status != Z_STREAM_END)
208  {
209  deflateEnd(&ctx.strm);
210  throw io_error("Deflation failed. Compressed BGZF data is too big.");
211  }
212 
213  status = deflateEnd(&ctx.strm);
214  if (status != Z_OK)
215  throw io_error("BGZF deflateEnd() failed.");
216 
217 
218  // 3. APPEND FOOTER
219 
220  // Set compressed length into buffer, compute CRC and write CRC into buffer.
221 
222  size_t len = dstCapacity - ctx.strm.avail_out;
223  _bgzfPack16(dstBegin + 16, len - 1);
224 
225  dstBegin += len - BLOCK_FOOTER_LENGTH;
226  _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
227  _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
228 
229  return dstCapacity - ctx.strm.avail_out;
230 }
231 
232 // ----------------------------------------------------------------------------
233 // Function decompressInit() - GZIP
234 // ----------------------------------------------------------------------------
235 
236 inline void
237 decompressInit(CompressionContext<detail::gz_compression> & ctx)
238 {
239  const int GZIP_WINDOW_BITS = -15; // no zlib header
240 
241  ctx.strm.zalloc = NULL;
242  ctx.strm.zfree = NULL;
243  int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
244  if (status != Z_OK)
245  throw io_error("GZip inflateInit2() failed.");
246 }
247 
248 // ----------------------------------------------------------------------------
249 // Function decompressInit() - BGZF
250 // ----------------------------------------------------------------------------
251 
252 inline void
253 decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
254 {
255  decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
256  ctx.headerPos = 0;
257 }
258 
259 // ----------------------------------------------------------------------------
260 // Function _decompressBlock()
261 // ----------------------------------------------------------------------------
262 
263 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
264 inline TDestCapacity
265 _decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
266  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
267 {
268  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
269  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
270 
271  assert(sizeof(TSourceValue) == 1u);
272  assert(sizeof(unsigned) == 4u);
273 
274  // 1. CHECK HEADER
275 
276  if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
277  throw io_error("BGZF block too short.");
278 
279  if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
280  throw io_error("Invalid BGZF block header.");
281 
282  size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
283  if (compressedLen != srcLength)
284  throw io_error("BGZF compressed size mismatch.");
285 
286 
287  // 2. DECOMPRESS
288 
289  decompressInit(ctx);
290  ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
291  ctx.strm.next_out = (Bytef *)(dstBegin);
292  ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
293  ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
294 
295  int status = inflate(&ctx.strm, Z_FINISH);
296  if (status != Z_STREAM_END)
297  {
298  inflateEnd(&ctx.strm);
299  throw io_error("Inflation failed. Decompressed BGZF data is too big.");
300  }
301 
302  status = inflateEnd(&ctx.strm);
303  if (status != Z_OK)
304  throw io_error("BGZF inflateEnd() failed.");
305 
306 
307  // 3. CHECK FOOTER
308 
309  // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
310 
311  unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
312 
313  srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
314  if (_bgzfUnpack32(srcBegin) != crc)
315  throw io_error("BGZF wrong checksum.");
316 
317  if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
318  throw io_error("BGZF size mismatch.");
319 
320  return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
321 }
322 
323 } // namespace seqan3::contrib
324 
325 #endif // defined(SEQAN3_HAS_ZLIB)
Provides various transformation traits used by the range module.
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
T memset(T... args)
Provides std::span from the C++20 standard library.
Provides utility functions for bit twiddling.
T uninitialized_copy(T... args)