OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_decoder_ssse3.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2022, Aous Naman
6 // Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2022, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_block_decoder_ssse3.cpp
34 // Author: Aous Naman
35 // Date: 13 May 2022
36 //***************************************************************************/
37 
38 //***************************************************************************/
43 #include <string>
44 #include <iostream>
45 
46 #include <cassert>
47 #include <cstring>
48 #include "ojph_block_common.h"
49 #include "ojph_block_decoder.h"
50 #include "ojph_arch.h"
51 #include "ojph_message.h"
52 
53 #include <immintrin.h>
54 
55 namespace ojph {
56  namespace local {
57 
58  //************************************************************************/
    /** MEL decoder state.
     *  Holds the read position into the MEL segment, a 64-bit buffer of
     *  unstuffed bits (consumed from the MSB), the MEL state machine index,
     *  and a small queue of decoded runs (7 bits per run, up to 8 runs).
     */
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
        k(0), num_runs(0), runs(0)
      {}
      // data decoding machinery
      ui8* data;    // pointer to the next byte of MEL data to read
      ui64 tmp;     // bit buffer; bits are consumed from the MSB side
      int bits;     // number of valid bits in tmp
      int size;     // number of bytes remaining in the MEL segment
      bool unstuff; // true if the next byte needs bit unstuffing
      int k;        // MEL state (index into the exponent table)

      // queue of decoded runs
      int num_runs; // number of decoded runs buffered in runs
      ui64 runs;    // buffered runs, 7 bits each, next run in the LSBs
    };
81 
82  //************************************************************************/
    /** Reads up to 32 bits from the MEL segment into melp->tmp, removing
     *  stuffed bits.
     *
     *  A byte equal to 0xFF causes the following byte to carry only 7 usable
     *  bits (one stuffing bit is dropped).  When the segment is exhausted,
     *  0xFF bytes are fed in, and the last real byte has its low nibble
     *  forced to 0xF because the MEL and VLC segments may share that byte.
     *
     *  @param melp MEL decoder state; tmp/bits/data/size/unstuff are updated.
     */
    static inline
    void mel_read(dec_mel_st *melp)
    {
      if (melp->bits > 32)  //there are enough bits in the tmp variable
        return;             // return without reading new data

      ui32 val = 0xFFFFFFFF;        // feed in 0xFF if buffer is exhausted
      if (melp->size > 4) {         // if there is data in the MEL segment
        val = *(ui32*)melp->data;   // read 32 bits from MEL data
        melp->data += 4;            // advance pointer
        melp->size -= 4;            // reduce counter
      }
      else if (melp->size > 0)
      { // 4 or less
        int i = 0;
        while (melp->size > 1) {
          ui32 v = *melp->data++; // read one byte at a time
          ui32 m = ~(0xFFu << i); // mask of location
          val = (val & m) | (v << i); // put one byte in its correct location
          --melp->size;
          i += 8;
        }
        // size equal to 1
        ui32 v = *melp->data++; // the one before the last is different
        v |= 0xF;               // MEL and VLC segments can overlap
        ui32 m = ~(0xFFu << i);
        val = (val & m) | (v << i);
        --melp->size;
      }

      // next we unstuff them before adding them to the buffer
      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
                                     // the previously read byte requires
                                     // unstuffing

      // data is unstuffed and accumulated in t
      // bits has the number of bits in t
      ui32 t = val & 0xFF;
      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
      bits -= unstuff; // there is one less bit in t if unstuffing is needed
      t = t << (8 - unstuff); // move up to make room for the next byte

      //this is a repeat of the above for the remaining three bytes
      t |= (val>>8) & 0xFF;
      unstuff = (((val >> 8) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val>>16) & 0xFF;
      unstuff = (((val >> 16) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      t |= (val>>24) & 0xFF;
      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);

      // move t to tmp, and push the result all the way up, so we read from
      // the MSB
      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
      melp->bits += bits; //increment the number of bits in tmp
    }
155 
156  //************************************************************************/
    /** Decodes MEL codewords from melp->tmp into runs of insignificant
     *  quads, pushing decoded runs into the melp->runs queue.
     *
     *  Each run value is (zeros << 1) + terminated_by_one, i.e. the LSB of
     *  a stored run tells whether the stretch of zero events ended with a
     *  one event.  The state k selects the exponent; a one-bit event
     *  increments k (max 12), a zero event decrements it (min 0).
     *
     *  @param melp MEL decoder state; runs/num_runs/tmp/bits/k are updated.
     */
    static inline
    void mel_decode(dec_mel_st *melp)
    {
      static const int mel_exp[13] = { //MEL exponents
        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
      };

      if (melp->bits < 6) // if there are less than 6 bits in tmp
        mel_read(melp);   // then read from the MEL bitstream
                          // 6 bits is the largest decodable MEL cwd

      //repeat so long that there is enough decodable bits in tmp,
      // and the runs store is not full (num_runs < 8)
      while (melp->bits >= 6 && melp->num_runs < 8)
      {
        int eval = mel_exp[melp->k]; // number of bits associated with state
        int run = 0;
        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
        { //one is found
          run = 1 << eval;
          run--; // consecutive runs of 0 events - 1
          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
          melp->tmp <<= 1; // consume one bit from tmp
          melp->bits -= 1;
          run = run << 1; // a stretch of zeros not terminating in one
        }
        else
        { //0 is found
          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
          melp->bits -= eval + 1;
          run = (run << 1) + 1; // a stretch of zeros terminating with one
        }
        eval = melp->num_runs * 7;           // 7 bits per run
        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
        melp->runs |= ((ui64)run) << eval;   // store the value in runs
        melp->num_runs++;                    // increment count
      }
    }
211 
212  //************************************************************************/
    /** Initializes the MEL decoder state over a codeblock buffer.
     *
     *  The MEL segment starts at bbuf + lcup - scup; one byte is excluded
     *  from size because the final byte is shared with the VLC segment.
     *  The loop below byte-aligns the read pointer to a multiple of 4 so
     *  that later 32-bit reads in mel_read are aligned.
     *
     *  @param melp MEL decoder state to initialize
     *  @param bbuf codeblock compressed data buffer
     *  @param lcup length of the cleanup pass
     *  @param scup length of the MEL + VLC segments
     */
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;         // 0 bits in tmp
      melp->tmp = 0;          //
      melp->unstuff = false;  // no unstuffing
      melp->size = scup - 1;  // size is the length of MEL+VLC-1
      melp->k = 0;            // 0 for state
      melp->num_runs = 0;     // num_runs is 0
      melp->runs = 0;         //

      //This code is borrowed; original is for a different architecture
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF; //if buffer is
                                    // consumed, set data to 0xFF
        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
                                       // see the standard
        melp->data += melp->size-- > 0; //increment if the end is not reached
        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
        melp->bits += d_bits;     //increment tmp by number of bits
        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
                                              //unstuffing
      }
      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
                                       // is the MSB
    }
254 
255  //************************************************************************/
    /** Retrieves one decoded run from the MEL run queue, first decoding
     *  more MEL events if the queue is empty.  The returned run occupies
     *  7 bits; its LSB tells whether the stretch of zeros ended with a one.
     *
     *  NOTE(review): the function signature line was lost in extraction;
     *  from the body it takes a dec_mel_st* named melp and returns the run
     *  as an int -- confirm against the original file.
     */
    static inline
    {
      if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
        mel_decode(melp);

      int t = melp->runs & 0x7F; //retrieve one run
      melp->runs >>= 7;          // remove the retrieved run
      melp->num_runs--;
      return t; // return run
    }
272 
273  //************************************************************************/
    /** State of a backward-reading bitstream reader.
     *  Used for the VLC segment and (with different unstuffing rules) the
     *  MagRef segment; the data pointer moves towards lower addresses.
     */
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      //storage
      ui8* data;    // pointer to the next byte to read (moves backwards)
      ui64 tmp;     // bit buffer; the next bits to consume are in the LSBs
      ui32 bits;    // number of valid bits in tmp
      int size;     // number of bytes remaining in the segment
      bool unstuff; // true if the next byte needs bit unstuffing
    };
288 
289  //************************************************************************/
    /** Reads up to 32 bits backwards from the VLC segment into vlcp->tmp,
     *  removing stuffed bits.
     *
     *  A stuffing bit is removed when the previously read byte was greater
     *  than 0x8F and the current byte's low 7 bits equal 0x7F.  When the
     *  segment is exhausted, zero bytes are fed in.
     *
     *  @param vlcp reverse VLC reader state; tmp/bits/data/size/unstuff
     *              are updated.
     */
    static inline
    void rev_read(rev_struct *vlcp)
    {
      //process 4 bytes at a time
      if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
        return;            // reading 32 bits can overflow vlcp->tmp
      ui32 val = 0;
      //the next line (the if statement) needs to be tested first
      if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
      {
        // (vlcp->data - 3) move pointer back to read 32 bits at once
        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
        vlcp->data -= 4; // move data pointer back by 4
        vlcp->size -= 4; // reduce available byte by 4
      }
      else if (vlcp->size > 0)
      { // 4 or less
        int i = 24;
        while (vlcp->size > 0) {
          ui32 v = *vlcp->data--; // read one byte at a time
          val |= (v << i);        // put byte in its correct location
          --vlcp->size;
          i -= 8;
        }
      }

      //accumulate in tmp, number of bits in tmp are stored in bits
      ui32 tmp = val >> 24; //start with the MSB byte
      ui32 bits;

      // test unstuff (previous byte is >0x8F), and this byte is 0x7F
      bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F; //this is for the next byte

      tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits;
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      // now move the read and unstuffed bits into vlcp->tmp
      vlcp->tmp |= (ui64)tmp << vlcp->bits;
      vlcp->bits += bits;
      vlcp->unstuff = unstuff; // this for the next read
    }
360 
361  //************************************************************************/
    /** Initializes the backward VLC reader.
     *
     *  The VLC segment ends just before the last byte of the MEL+VLC area;
     *  the last VLC byte contributes only its upper nibble (the lower
     *  nibble belongs to the scup length field).  The loop byte-aligns
     *  the read pointer so that later 32-bit reads are aligned.
     *
     *  @param vlcp reverse VLC reader state to initialize
     *  @param data codeblock compressed data buffer
     *  @param lcup length of the cleanup pass
     *  @param scup length of the MEL + VLC segments
     */
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      //first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      //size can not be larger than this, in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;     // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
      // To read 32 bits, read from (vlcp->data - 3)
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--; // read one byte and move read pointer
        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F; // for next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp); // read another 32 bits
    }
408 
409  //************************************************************************/
    /** Returns the next 32 bits at the head (LSBs) of the reversed VLC
     *  buffer without consuming them, refilling twice if needed because a
     *  single rev_read can deliver fewer than 32 bits after unstuffing.
     *
     *  NOTE(review): the function signature line was lost in extraction;
     *  from the body it takes a rev_struct* named vlcp and returns ui32 --
     *  confirm against the original file.
     */
    static inline
    {
      if (vlcp->bits < 32) // if there are less then 32 bits, read more
      {
        rev_read(vlcp);     // read 32 bits, but unstuffing might reduce this
        if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
          rev_read(vlcp);   // read another 32
      }
      return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
    }
427 
428  //************************************************************************/
434  static inline
435  ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
436  {
437  assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
438  vlcp->tmp >>= num_bits; // remove bits
439  vlcp->bits -= num_bits; // decrement the number of bits
440  return (ui32)vlcp->tmp;
441  }
442 
443  //************************************************************************/
    /** Reads up to 32 bits backwards from the MagRef (MRP) segment into
     *  mrp->tmp, removing stuffed bits.  Same unstuffing rule as rev_read
     *  (previous byte > 0x8F and current low 7 bits == 0x7F); exhausted
     *  data is replaced by zero bytes.
     *
     *  NOTE(review): the function signature line was lost in extraction;
     *  from the body it takes a rev_struct* named mrp and returns void --
     *  confirm against the original file.
     */
    static inline
    {
      //process 4 bytes at a time
      if (mrp->bits > 32)
        return;
      ui32 val = 0;
      if (mrp->size > 3) // If there are 3 byte or more
      { // (mrp->data - 3) move pointer back to read 32 bits at once
        val = *(ui32*)(mrp->data - 3); // read 32 bits
        mrp->data -= 4; // move back pointer
        mrp->size -= 4; // reduce count
      }
      else if (mrp->size > 0)
      {
        int i = 24;
        while (mrp->size > 0) {
          ui32 v = *mrp->data--; // read one byte at a time
          val |= (v << i);       // put byte in its correct location
          --mrp->size;
          i -= 8;
        }
      }

      //accumulate in tmp, and keep count in bits
      ui32 bits, tmp = val >> 24;

      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F;

      //process the next byte
      tmp |= ((val >> 16) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits;
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits;
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp->tmp
      mrp->bits += bits;
      mrp->unstuff = unstuff; // next byte
    }
502 
503  //************************************************************************/
518  static inline
519  void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
520  {
521  mrp->data = data + lcup + len2 - 1;
522  mrp->size = len2;
523  mrp->unstuff = true;
524  mrp->bits = 0;
525  mrp->tmp = 0;
526 
527  //This code is designed for an architecture that read address should
528  // align to the read size (address multiple of 4 if read size is 4)
529  //These few lines take care of the case where data is not at a multiple
530  // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
531  int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
532  for (int i = 0; i < num; ++i) {
533  ui64 d;
534  //read a byte, 0 if no more data
535  d = (mrp->size-- > 0) ? *mrp->data-- : 0;
536  //check if unstuffing is needed
537  ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
538  mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
539  mrp->bits += d_bits;
540  mrp->unstuff = d > 0x8F; // for next byte
541  }
542  rev_read_mrp(mrp);
543  }
544 
545  //************************************************************************/
    /** Returns the next 32 bits at the head (LSBs) of the MRP bit buffer
     *  without consuming them, refilling twice if needed because a single
     *  rev_read_mrp can deliver fewer than 32 bits after unstuffing.
     *
     *  NOTE(review): the function signature line was lost in extraction;
     *  from the body it takes a rev_struct* named mrp and returns ui32 --
     *  confirm against the original file.
     */
    static inline
    {
      if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
      {
        rev_read_mrp(mrp);    // read 30-32 bits from mrp
        if (mrp->bits < 32)   // if there is a space of 32 bits
          rev_read_mrp(mrp);  // read more
      }
      return (ui32)mrp->tmp;  // return the head of mrp->tmp
    }
563 
564  //************************************************************************/
570  inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
571  {
572  assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
573  mrp->tmp >>= num_bits; // discard the lowest num_bits bits
574  mrp->bits -= num_bits;
575  return (ui32)mrp->tmp; // return data after consumption
576  }
577 
578  //************************************************************************/
    /** State of the forward bitstream reader (used for MagSgn/SgnProp). */
    struct frwd_struct {
      const ui8* data; // pointer to the next chunk of data to read
      ui8 tmp[48];     // staging buffer of unstuffed bits
      ui32 bits;       // number of valid bits in tmp
      ui32 unstuff;    // 1 if the next byte needs unstuffing, else 0
      int size;        // number of bytes remaining in the stream
    };
589 
590  //************************************************************************/
    /** Reads 16 bytes from the forward stream, removes stuffed bits with
     *  SSE shifts, and appends the surviving bits to the msp->tmp staging
     *  buffer at the current bit position.
     *
     *  Template parameter X (0 or 0xFF) selects the fill byte used once
     *  the stream is exhausted (0xFF for MagSgn, 0 for SgnProp).
     *  A bit is unstuffed after every 0xFF byte; the while loop removes
     *  one stuffing bit per iteration by closing the gap with 128-bit
     *  funnel shifts.
     *
     *  NOTE(review): the function signature line was lost in extraction;
     *  from the body it is presumably void frwd_read(frwd_struct *msp) --
     *  confirm against the original file.
     */
    template<int X>
    static inline
    {
      assert(msp->bits <= 128);

      __m128i offset, val, validity, all_xff;
      val = _mm_loadu_si128((__m128i*)msp->data);
      int bytes = msp->size >= 16 ? 16 : msp->size;
      validity = _mm_set1_epi8((char)bytes);
      msp->data += bytes;
      msp->size -= bytes;
      int bits = 128;
      offset = _mm_set_epi64x(0x0F0E0D0C0B0A0908,0x0706050403020100);
      validity = _mm_cmpgt_epi8(validity, offset);
      all_xff = _mm_set1_epi8(-1);
      if (X == 0xFF) // the compiler should remove this if statement
      {
        __m128i t = _mm_xor_si128(validity, all_xff); // complement
        val = _mm_or_si128(t, val); // fill with 0xFF
      }
      else if (X == 0)
        val = _mm_and_si128(validity, val); // fill with zeros
      else
        assert(0);

      __m128i ff_bytes;
      ff_bytes = _mm_cmpeq_epi8(val, all_xff);
      ff_bytes = _mm_and_si128(ff_bytes, validity);
      ui32 flags = (ui32)_mm_movemask_epi8(ff_bytes);
      flags <<= 1; // unstuff following byte
      ui32 next_unstuff = flags >> 16;
      flags |= msp->unstuff;
      flags &= 0xFFFF;
      while (flags)
      { // bit unstuffing occurs on average once every 256 bytes
        // therefore it is not an issue if it is a bit slow
        // here we process 16 bytes
        --bits; // consuming one stuffing bit

        ui32 loc = 31 - count_leading_zeros(flags);
        flags ^= 1 << loc;

        __m128i m, t, c;
        t = _mm_set1_epi8((char)loc);
        m = _mm_cmpgt_epi8(offset, t);

        t = _mm_and_si128(m, val); // keep bits at locations larger than loc
        c = _mm_srli_epi64(t, 1);  // 1 bits left
        t = _mm_srli_si128(t, 8);  // 8 bytes left
        t = _mm_slli_epi64(t, 63); // keep the MSB only
        t = _mm_or_si128(t, c);    // combine the above 3 steps

        val = _mm_or_si128(t, _mm_andnot_si128(m, val));
      }

      // combine with earlier data
      assert(msp->bits >= 0 && msp->bits <= 128);
      int cur_bytes = msp->bits >> 3;
      int cur_bits = msp->bits & 7;
      __m128i b1, b2;
      b1 = _mm_sll_epi64(val, _mm_set1_epi64x(cur_bits));
      b2 = _mm_slli_si128(val, 8); // 8 bytes right
      b2 = _mm_srl_epi64(b2, _mm_set1_epi64x(64-cur_bits));
      b1 = _mm_or_si128(b1, b2);
      b2 = _mm_loadu_si128((__m128i*)(msp->tmp + cur_bytes));
      b2 = _mm_or_si128(b1, b2);
      _mm_storeu_si128((__m128i*)(msp->tmp + cur_bytes), b2);

      int consumed_bits = bits < 128 - cur_bits ? bits : 128 - cur_bits;
      cur_bytes = (msp->bits + (ui32)consumed_bits + 7) >> 3; // round up
      int upper = _mm_extract_epi16(val, 7);
      upper >>= consumed_bits - 128 + 16;
      msp->tmp[cur_bytes] = (ui8)upper; // copy byte

      msp->bits += (ui32)bits;
      msp->unstuff = next_unstuff; // next unstuff
      assert(msp->unstuff == 0 || msp->unstuff == 1);
    }
687 
688  //************************************************************************/
697  template<int X>
698  static inline
699  void frwd_init(frwd_struct *msp, const ui8* data, int size)
700  {
701  msp->data = data;
702  _mm_storeu_si128((__m128i *)msp->tmp, _mm_setzero_si128());
703  _mm_storeu_si128((__m128i *)msp->tmp + 1, _mm_setzero_si128());
704  _mm_storeu_si128((__m128i *)msp->tmp + 2, _mm_setzero_si128());
705 
706  msp->bits = 0;
707  msp->unstuff = 0;
708  msp->size = size;
709 
710  frwd_read<X>(msp); // read 128 bits more
711  }
712 
713  //************************************************************************/
    /** Consumes num_bits (1..127) from the staging buffer msp->tmp.
     *
     *  The consumption is split into a whole multiple of 8 bytes, handled
     *  by offsetting the load pointer (the & 0x18 keeps it a multiple of 8
     *  within the 48-byte buffer), and a residual 0-63 bit right shift
     *  performed with 64-bit lane shifts plus cross-lane carry terms.
     *
     *  @param msp      forward reader state
     *  @param num_bits number of bits to consume (must not exceed msp->bits)
     */
    static inline
    void frwd_advance(frwd_struct *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      __m128i *p = (__m128i*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63;

      __m128i v0, v1, c0, c1, t;
      v0 = _mm_loadu_si128(p);
      v1 = _mm_loadu_si128(p + 1);

      // shift right by num_bits
      c0 = _mm_srl_epi64(v0, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v0, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);
      t = _mm_slli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c0 = _mm_or_si128(c0, t);

      _mm_storeu_si128((__m128i*)msp->tmp, c0);

      c1 = _mm_srl_epi64(v1, _mm_set1_epi64x(num_bits));
      t = _mm_srli_si128(v1, 8);
      t = _mm_sll_epi64(t, _mm_set1_epi64x(64 - num_bits));
      c1 = _mm_or_si128(c1, t);

      _mm_storeu_si128((__m128i*)msp->tmp + 1, c1);
    }
750 
751  //************************************************************************/
758  template<int X>
759  static inline
760  __m128i frwd_fetch(frwd_struct *msp)
761  {
762  if (msp->bits <= 128)
763  {
764  frwd_read<X>(msp);
765  if (msp->bits <= 128) //need to test
766  frwd_read<X>(msp);
767  }
768  __m128i t = _mm_loadu_si128((__m128i*)msp->tmp);
769  return t;
770  }
771 
772  //************************************************************************/
    /** Decodes one quad (4 samples) of MagSgn data on the 32-bit path.
     *
     *  @tparam N      which quad (0 or 1) of inf_u_q/U_q to process
     *  @param inf_u_q lanes hold significance flags (rho), EMB patterns
     *                 (e_k, e_1) for two quads
     *  @param U_q     lanes hold the exponent bound U_q for two quads
     *  @param magsgn  forward MagSgn bitstream reader
     *  @param p       bitplane at which the cleanup pass operates
     *  @param vn      in/out accumulator of column magnitudes, used later
     *                 for exponent (E) computation
     *  @return        the 4 decoded sample values for this quad row
     */
    template <int N>
    static inline
    __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      w0 = _mm_shuffle_epi32(inf_u_q, _MM_SHUFFLE(N, N, N, N));
      // we keep e_k, e_1, and rho in flags
      flags = _mm_and_si128(w0, _mm_set_epi32(0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi32(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi32(U_q, _MM_SHUFFLE(N, N, N, N));
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,1,2,2,4,4,8,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m128i m_n;
        w0 = _mm_srli_epi32(flags, 15); // e_k
        m_n = _mm_sub_epi32(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi32(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 6);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 4); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi32(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi32(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x03020100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi32(byte_idx, _mm_set1_epi32(0x01010101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift samples values to correct location
        bit_idx = _mm_or_si128(bit_idx, _mm_slli_epi32(bit_idx, 16));
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask
        __m128i shift;
        __m128i ones = _mm_set1_epi32(1);
        __m128i twos = _mm_set1_epi32(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        U_q_m1 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        w0 = _mm_sub_epi32(twos, w0);
        shift = _mm_sll_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
        ms_vec = _mm_and_si128(d0, _mm_sub_epi32(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi32(0x800));
        w0 = _mm_cmpeq_epi32(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);    // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0);   // e_1
        w0 = _mm_slli_epi32(ms_vec, 31);     // sign
        ms_vec = _mm_or_si128(ms_vec, ones); // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi32(ms_vec, twos);// + 2
        ms_vec = _mm_slli_epi32(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        if (N == 0) // the compiler should remove one
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, -1, 0x0F0E0D0C, 0x07060504));
        else if (N == 1)
          tvn = _mm_shuffle_epi8(ms_vec,
            _mm_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1));
        else
          assert(0);
        vn = _mm_or_si128(vn, tvn);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
884 
885  //************************************************************************/
    /** Decodes two quads (8 samples) of MagSgn data on the 16-bit path,
     *  used when sample magnitudes fit in 16 bits.
     *
     *  @param inf_u_q lanes hold significance flags (rho) and EMB patterns
     *                 (e_k, e_1) for the two quads
     *  @param U_q     lanes hold the exponent bound U_q for the two quads
     *  @param magsgn  forward MagSgn bitstream reader
     *  @param p       bitplane at which the cleanup pass operates
     *  @param vn      in/out accumulator of column magnitudes, used later
     *                 for exponent (E) computation
     *  @return        the 8 decoded 16-bit sample values
     */
    static inline
    __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q,
                              frwd_struct* magsgn, ui32 p, __m128i& vn)
    {
      __m128i w0;    // workers
      __m128i insig; // lanes hold FF's if samples are insignificant
      __m128i flags; // lanes hold e_k, e_1, and rho
      __m128i row;   // decoded row

      row = _mm_setzero_si128();
      w0 = _mm_shuffle_epi8(inf_u_q,
        _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                      0x0100, 0x0100, 0x0100, 0x0100));
      // we keep e_k, e_1, and rho in flags
      flags = _mm_and_si128(w0,
        _mm_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
                      (si16)0x8880, 0x4440, 0x2220, 0x1110));
      insig = _mm_cmpeq_epi16(flags, _mm_setzero_si128());
      if (_mm_movemask_epi8(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = _mm_shuffle_epi8(U_q,
          _mm_set_epi16(0x0504, 0x0504, 0x0504, 0x0504,
                        0x0100, 0x0100, 0x0100, 0x0100));
        flags = _mm_mullo_epi16(flags, _mm_set_epi16(1,2,4,8,1,2,4,8));
        __m128i ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        // 0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        __m128i m_n;
        w0 = _mm_srli_epi16(flags, 15); // e_k
        m_n = _mm_sub_epi16(U_q, w0);
        m_n = _mm_andnot_si128(insig, m_n);

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        __m128i inc_sum = m_n; // inclusive scan
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 2));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 4));
        inc_sum = _mm_add_epi16(inc_sum, _mm_bslli_si128(inc_sum, 8));
        int total_mn = _mm_extract_epi16(inc_sum, 7);
        __m128i ex_sum = _mm_bslli_si128(inc_sum, 2); // exclusive scan

        // find the starting byte and starting bit
        __m128i byte_idx = _mm_srli_epi16(ex_sum, 3);
        __m128i bit_idx = _mm_and_si128(ex_sum, _mm_set1_epi16(7));
        byte_idx = _mm_shuffle_epi8(byte_idx,
          _mm_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
                        0x0606, 0x0404, 0x0202, 0x0000));
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0100));
        __m128i d0 = _mm_shuffle_epi8(ms_vec, byte_idx);
        byte_idx = _mm_add_epi16(byte_idx, _mm_set1_epi16(0x0101));
        __m128i d1 = _mm_shuffle_epi8(ms_vec, byte_idx);

        // shift samples values to correct location
        __m128i bit_shift = _mm_shuffle_epi8(
          _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
                       1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
        bit_shift = _mm_add_epi16(bit_shift, _mm_set1_epi16(0x0101));
        d0 = _mm_mullo_epi16(d0, bit_shift);
        d0 = _mm_srli_epi16(d0, 8); // we should have 8 bits in the LSB
        d1 = _mm_mullo_epi16(d1, bit_shift);
        d1 = _mm_and_si128(d1, _mm_set1_epi16((si16)0xFF00)); // 8 in MSB
        d0 = _mm_or_si128(d0, d1);

        // find location of e_k and mask
        __m128i shift, t0, t1, Uq0, Uq1;
        __m128i ones = _mm_set1_epi16(1);
        __m128i twos = _mm_set1_epi16(2);
        __m128i U_q_m1 = _mm_sub_epi32(U_q, ones);
        Uq0 = _mm_and_si128(U_q_m1, _mm_set_epi32(0,0,0,0x1F));
        Uq1 = _mm_bsrli_si128(U_q_m1, 14);
        w0 = _mm_sub_epi16(twos, w0);
        t0 = _mm_and_si128(w0, _mm_set_epi64x(0, -1));
        t1 = _mm_and_si128(w0, _mm_set_epi64x(-1, 0));
        t0 = _mm_sll_epi16(t0, Uq0);
        t1 = _mm_sll_epi16(t1, Uq1);
        shift = _mm_or_si128(t0, t1);
        ms_vec = _mm_and_si128(d0, _mm_sub_epi16(shift, ones));

        // next e_1
        w0 = _mm_and_si128(flags, _mm_set1_epi16(0x800));
        w0 = _mm_cmpeq_epi16(w0, _mm_setzero_si128());
        w0 = _mm_andnot_si128(w0, shift);    // e_1 in correct position
        ms_vec = _mm_or_si128(ms_vec, w0);   // e_1
        w0 = _mm_slli_epi16(ms_vec, 15);     // sign
        ms_vec = _mm_or_si128(ms_vec, ones); // bin center
        __m128i tvn = ms_vec;
        ms_vec = _mm_add_epi16(ms_vec, twos);// + 2
        ms_vec = _mm_slli_epi16(ms_vec, (si32)p - 1);
        ms_vec = _mm_or_si128(ms_vec, w0);     // sign
        row = _mm_andnot_si128(insig, ms_vec); // significant only

        ms_vec = _mm_andnot_si128(insig, tvn); // significant only
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
        vn = _mm_or_si128(vn, w0);
        w0 = _mm_shuffle_epi8(ms_vec,
          _mm_set_epi16(-1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
        vn = _mm_or_si128(vn, w0);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
1003 
1004 
1005  //************************************************************************/
1022  bool ojph_decode_codeblock_ssse3(ui8* coded_data, ui32* decoded_data,
1023  ui32 missing_msbs, ui32 num_passes,
1024  ui32 lengths1, ui32 lengths2,
1025  ui32 width, ui32 height, ui32 stride,
1026  bool stripe_causal)
1027  {
1028  static bool insufficient_precision = false;
1029  static bool modify_code = false;
1030  static bool truncate_spp_mrp = false;
1031 
1032  if (num_passes > 1 && lengths2 == 0)
1033  {
1034  OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1035  "one coding pass, but zero length for "
1036  "2nd and potential 3rd pass.\n");
1037  num_passes = 1;
1038  }
1039 
1040  if (num_passes > 3)
1041  {
1042  OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1043  "This codeblocks has %d passes.\n",
1044  num_passes);
1045  return false;
1046  }
1047 
1048  if (missing_msbs > 30) // p < 0
1049  {
1050  if (insufficient_precision == false)
1051  {
1052  insufficient_precision = true;
1053  OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1054  "codeblock. This message will not be "
1055  "displayed again.\n");
1056  }
1057  return false;
1058  }
1059  else if (missing_msbs == 30) // p == 0
1060  { // not enough precision to decode and set the bin center to 1
1061  if (modify_code == false) {
1062  modify_code = true;
1063  OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1064  "pass. The code can be modified to support "
1065  "this case. This message will not be "
1066  "displayed again.\n");
1067  }
1068  return false; // 32 bits are not enough to decode this
1069  }
1070  else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1071  {
1072  if (num_passes > 1) {
1073  num_passes = 1;
1074  if (truncate_spp_mrp == false) {
1075  truncate_spp_mrp = true;
1076  OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1077  "nor MagRef passes; both will be skipped. "
1078  "This message will not be displayed "
1079  "again.\n");
1080  }
1081  }
1082  }
1083  ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1084  // There is a way to handle the case of p == 0, but a different path
1085  // is required
1086 
1087  if (lengths1 < 2)
1088  {
1089  OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
1090  return false;
1091  }
1092 
1093  // read scup and fix the bytes there
1094  int lcup, scup;
1095  lcup = (int)lengths1; // length of CUP
1096  //scup is the length of MEL + VLC
1097  scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1098  if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1099  return false;
1100 
1101  // The temporary storage scratch holds two types of data in an
1102  // interleaved fashion. The interleaving allows us to use one
1103  // memory pointer.
1104  // We have one entry for a decoded VLC code, and one entry for UVLC.
1105  // Entries are 16 bits each, corresponding to one quad,
1106  // but since we want to use XMM registers of the SSE family
1107  // of SIMD; we allocated 16 bytes or more per quad row; that is,
1108  // the width is no smaller than 16 bytes (or 8 entries), and the
1109  // height is 512 quads
1110  // Each VLC entry contains, in the following order, starting
1111  // from MSB
1112  // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1113  // Each entry in UVLC contains u_q
1114  // One extra row to handle the case of SPP propagating downwards
1115  // when codeblock width is 4
1116  ui16 scratch[8 * 513] = {0}; // 8+ kB
1117 
1118  // We need an extra two entries (one inf and one u_q) beyond
1119  // the last column.
1120  // If the block width is 4 (2 quads), then we use sstr of 8
1121  // (enough for 4 quads). If width is 8 (4 quads) we use
1122  // sstr is 16 (enough for 8 quads). For a width of 16 (8
1123  // quads), we use 24 (enough for 12 quads).
1124  ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1125 
1126  assert((stride & 0x3) == 0);
1127 
1128  ui32 mmsbp2 = missing_msbs + 2;
1129 
1130  // The cleanup pass is decoded in two steps; in step one,
1131  // the VLC and MEL segments are decoded, generating a record that
1132  // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1133  // This information should be sufficient for the next step.
1134  // In step 2, we decode the MagSgn segment.
1135 
1136  // step 1 decoding VLC and MEL segments
1137  {
1138  // init structures
1139  dec_mel_st mel;
1140  mel_init(&mel, coded_data, lcup, scup);
1141  rev_struct vlc;
1142  rev_init(&vlc, coded_data, lcup, scup);
1143 
1144  int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1145  // data represented as runs of 0 events
1146  // See mel_decode description
1147 
1148  ui32 vlc_val;
1149  ui32 c_q = 0;
1150  ui16 *sp = scratch;
1151  //initial quad row
1152  for (ui32 x = 0; x < width; sp += 4)
1153  {
1154  // decode VLC
1156 
1157  // first quad
1158  vlc_val = rev_fetch(&vlc);
1159 
1160  //decode VLC using the context c_q and the head of VLC bitstream
1161  ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1162 
1163  // if context is zero, use one MEL event
1164  if (c_q == 0) //zero context
1165  {
1166  run -= 2; //subtract 2, since events number is multiplied by 2
1167 
1168  // Is the run terminated in 1? if so, use decoded VLC code,
1169  // otherwise, discard decoded data, since we will decode again
1170  // using a different context
1171  t0 = (run == -1) ? t0 : 0;
1172 
1173  // is run -1 or -2? this means a run has been consumed
1174  if (run < 0)
1175  run = mel_get_run(&mel); // get another run
1176  }
1177  //run -= (c_q == 0) ? 2 : 0;
1178  //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1179  //if (run < 0)
1180  // run = mel_get_run(&mel); // get another run
1181  sp[0] = t0;
1182  x += 2;
1183 
1184  // prepare context for the next quad; eqn. 1 in ITU T.814
1185  c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1186 
1187  //remove data from vlc stream (0 bits are removed if vlc is not used)
1188  vlc_val = rev_advance(&vlc, t0 & 0x7);
1189 
1190  //second quad
1191  ui16 t1 = 0;
1192 
1193  //decode VLC using the context c_q and the head of VLC bitstream
1194  t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1195 
1196  // if context is zero, use one MEL event
1197  if (c_q == 0 && x < width) //zero context
1198  {
1199  run -= 2; //subtract 2, since events number is multiplied by 2
1200 
1201  // if event is 0, discard decoded t1
1202  t1 = (run == -1) ? t1 : 0;
1203 
1204  if (run < 0) // have we consumed all events in a run
1205  run = mel_get_run(&mel); // if yes, then get another run
1206  }
1207  t1 = x < width ? t1 : 0;
1208  //run -= (c_q == 0 && x < width) ? 2 : 0;
1209  //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1210  //if (run < 0)
1211  // run = mel_get_run(&mel); // get another run
1212  sp[2] = t1;
1213  x += 2;
1214 
1215  //prepare context for the next quad, eqn. 1 in ITU T.814
1216  c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1217 
1218  //remove data from vlc stream, if qinf is not used, cwdlen is 0
1219  vlc_val = rev_advance(&vlc, t1 & 0x7);
1220 
1221  // decode u
1223  // uvlc_mode is made up of u_offset bits from the quad pair
1224  ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1225  if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1226  { // the MEL run of events
1227  run -= 2; //subtract 2, since events number is multiplied by 2
1228 
1229  uvlc_mode += (run == -1) ? 0x40 : 0; // the increment, if needed,
1230  // is 0x40
1231 
1232  if (run < 0)//if run is consumed (run is -1 or -2), get another run
1233  run = mel_get_run(&mel);
1234  }
1235  //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1236  //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1237  //if (run < 0)
1238  // run = mel_get_run(&mel); // get another run
1239 
1240  //decode uvlc_mode to get u for both quads
1241  ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1242  //remove total prefix length
1243  vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1244  uvlc_entry >>= 3;
1245  //extract suffixes for quad 0 and 1
1246  ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1247  ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1248  vlc_val = rev_advance(&vlc, len);
1249  uvlc_entry >>= 4;
1250  // quad 0 length
1251  len = uvlc_entry & 0x7; // quad 0 suffix length
1252  uvlc_entry >>= 3;
1253  ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kappa == 1
1254  sp[1] = u_q;
1255  u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1256  sp[3] = u_q;
1257  }
1258  sp[0] = sp[1] = 0;
1259 
1260  //non initial quad rows
1261  for (ui32 y = 2; y < height; y += 2)
1262  {
1263  c_q = 0; // context
1264  ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1265 
1266  for (ui32 x = 0; x < width; sp += 4)
1267  {
1268  // decode VLC
1270 
1271  // sigma_q (n, ne, nf)
1272  c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1273  c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1274 
1275  // first quad
1276  vlc_val = rev_fetch(&vlc);
1277 
1278  //decode VLC using the context c_q and the head of VLC bitstream
1279  ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1280 
1281  // if context is zero, use one MEL event
1282  if (c_q == 0) //zero context
1283  {
1284  run -= 2; //subtract 2, since events number is multiplied by 2
1285 
1286  // Is the run terminated in 1? if so, use decoded VLC code,
1287  // otherwise, discard decoded data, since we will decode again
1288  // using a different context
1289  t0 = (run == -1) ? t0 : 0;
1290 
1291  // is run -1 or -2? this means a run has been consumed
1292  if (run < 0)
1293  run = mel_get_run(&mel); // get another run
1294  }
1295  //run -= (c_q == 0) ? 2 : 0;
1296  //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1297  //if (run < 0)
1298  // run = mel_get_run(&mel); // get another run
1299  sp[0] = t0;
1300  x += 2;
1301 
1302  // prepare context for the next quad; eqn. 2 in ITU T.814
1303  // sigma_q (w, sw)
1304  c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1305  // sigma_q (nw)
1306  c_q |= sp[0 - (si32)sstr] & 0x80;
1307  // sigma_q (n, ne, nf)
1308  c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1309  c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1310 
1311  //remove data from vlc stream (0 bits are removed if vlc is unused)
1312  vlc_val = rev_advance(&vlc, t0 & 0x7);
1313 
1314  //second quad
1315  ui16 t1 = 0;
1316 
1317  //decode VLC using the context c_q and the head of VLC bitstream
1318  t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1319 
1320  // if context is zero, use one MEL event
1321  if (c_q == 0 && x < width) //zero context
1322  {
1323  run -= 2; //subtract 2, since events number is multiplied by 2
1324 
1325  // if event is 0, discard decoded t1
1326  t1 = (run == -1) ? t1 : 0;
1327 
1328  if (run < 0) // have we consumed all events in a run
1329  run = mel_get_run(&mel); // if yes, then get another run
1330  }
1331  t1 = x < width ? t1 : 0;
1332  //run -= (c_q == 0 && x < width) ? 2 : 0;
1333  //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1334  //if (run < 0)
1335  // run = mel_get_run(&mel); // get another run
1336  sp[2] = t1;
1337  x += 2;
1338 
1339  // partial c_q, will be completed when we process the next quad
1340  // sigma_q (w, sw)
1341  c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1342  // sigma_q (nw)
1343  c_q |= sp[2 - (si32)sstr] & 0x80;
1344 
1345  //remove data from vlc stream, if qinf is not used, cwdlen is 0
1346  vlc_val = rev_advance(&vlc, t1 & 0x7);
1347 
1348  // decode u
1350  // uvlc_mode is made up of u_offset bits from the quad pair
1351  ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1352  ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1353  //remove total prefix length
1354  vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1355  uvlc_entry >>= 3;
1356  //extract suffixes for quad 0 and 1
1357  ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1358  ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1359  vlc_val = rev_advance(&vlc, len);
1360  uvlc_entry >>= 4;
1361  // quad 0 length
1362  len = uvlc_entry & 0x7; // quad 0 suffix length
1363  uvlc_entry >>= 3;
1364  ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
1365  sp[1] = u_q;
1366  u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1367  sp[3] = u_q;
1368  }
1369  sp[0] = sp[1] = 0;
1370  }
1371  }
1372 
1373  // step2 we decode magsgn
1374  // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1375  // The 32 bit path decodes 16-bit data, for which one would think
1376  // 16 bits are enough, because we want to put in the center of the
1377  // bin.
1378  // If you have mmsbp2 equals 16 bit, and reversible coding, and
1379  // no bitplanes are missing, then we can decode using the 16 bit
1380  // path, but we are not doing this here.
1381  if (mmsbp2 >= 16)
1382  {
1383  // We allocate a scratch row for storing v_n values.
1384  // We have 512 quads horizontally.
1385  // We may go beyond the last entry by up to 4 entries.
1386  // Here we allocate additional 8 entries.
1387  // There are two rows in this structure, the bottom
1388  // row is used to store processed entries.
1389  const int v_n_size = 512 + 8;
1390  ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1391 
1392  frwd_struct magsgn;
1393  frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1394 
1395  {
1396  ui16 *sp = scratch;
1397  ui32 *vp = v_n_scratch;
1398  ui32 *dp = decoded_data;
1399  vp[0] = 2; // for easy calculation of emax
1400 
1401  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1402  {
1403  //here we process two quads
1404  __m128i w0, w1; // workers
1405  __m128i inf_u_q, U_q;
1406  // determine U_q
1407  {
1408  inf_u_q = _mm_loadu_si128((__m128i*)sp);
1409  U_q = _mm_srli_epi32(inf_u_q, 16);
1410 
1411  w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1412  int i = _mm_movemask_epi8(w0);
1413  if (i & 0xFF) // only the lower two U_q
1414  return false;
1415  }
1416 
1417  __m128i vn = _mm_set1_epi32(2);
1418  __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1419  __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1420  w0 = _mm_loadu_si128((__m128i*)vp);
1421  w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1422  w0 = _mm_or_si128(w0, vn);
1423  _mm_storeu_si128((__m128i*)vp, w0);
1424 
1425  //interleave in ssse3 style
1426  w0 = _mm_unpacklo_epi32(row0, row1);
1427  w1 = _mm_unpackhi_epi32(row0, row1);
1428  row0 = _mm_unpacklo_epi32(w0, w1);
1429  row1 = _mm_unpackhi_epi32(w0, w1);
1430  _mm_store_si128((__m128i*)dp, row0);
1431  _mm_store_si128((__m128i*)(dp + stride), row1);
1432  }
1433  }
1434 
1435  for (ui32 y = 2; y < height; y += 2)
1436  {
1437  {
1438  // perform 31 - count_leading_zeros(*vp) here
1439  ui32 *vp = v_n_scratch;
1440  const __m128i lut_lo = _mm_set_epi8(
1441  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 31
1442  );
1443  const __m128i lut_hi = _mm_set_epi8(
1444  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 31
1445  );
1446  const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1447  const __m128i byte_offset8 = _mm_set1_epi16(8);
1448  const __m128i byte_offset16 = _mm_set1_epi16(16);
1449  const __m128i cc = _mm_set1_epi32(31);
1450  for (ui32 x = 0; x <= width; x += 8, vp += 4)
1451  {
1452  __m128i v, t; // workers
1453  v = _mm_loadu_si128((__m128i*)vp);
1454 
1455  t = _mm_and_si128(nibble_mask, v);
1456  v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1457  t = _mm_shuffle_epi8(lut_lo, t);
1458  v = _mm_shuffle_epi8(lut_hi, v);
1459  v = _mm_min_epu8(v, t);
1460 
1461  t = _mm_srli_epi16(v, 8);
1462  v = _mm_or_si128(v, byte_offset8);
1463  v = _mm_min_epu8(v, t);
1464 
1465  t = _mm_srli_epi32(v, 16);
1466  v = _mm_or_si128(v, byte_offset16);
1467  v = _mm_min_epu8(v, t);
1468 
1469  v = _mm_sub_epi16(cc, v);
1470  _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1471  }
1472  }
1473 
1474  ui32 *vp = v_n_scratch;
1475  ui16 *sp = scratch + (y >> 1) * sstr;
1476  ui32 *dp = decoded_data + y * stride;
1477  vp[0] = 2; // for easy calculation of emax
1478 
1479  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1480  {
1481  //process two quads
1482  __m128i w0, w1; // workers
1483  __m128i inf_u_q, U_q;
1484  // determine U_q
1485  {
1486  __m128i gamma, emax, kappa, u_q; // needed locally
1487 
1488  inf_u_q = _mm_loadu_si128((__m128i*)sp);
1489  gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1490  w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1491  gamma = _mm_and_si128(gamma, w0);
1492  gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1493 
1494  emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1495  w0 = _mm_bsrli_si128(emax, 4);
1496  emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1497  emax = _mm_andnot_si128(gamma, emax);
1498 
1499  kappa = _mm_set1_epi32(1);
1500  kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1501 
1502  u_q = _mm_srli_epi32(inf_u_q, 16);
1503  U_q = _mm_add_epi32(u_q, kappa);
1504 
1505  w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1506  int i = _mm_movemask_epi8(w0);
1507  if (i & 0xFF) // only the lower two U_q
1508  return false;
1509  }
1510 
1511  __m128i vn = _mm_set1_epi32(2);
1512  __m128i row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1513  __m128i row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1514  w0 = _mm_loadu_si128((__m128i*)vp);
1515  w0 = _mm_and_si128(w0, _mm_set_epi32(0,0,0,-1));
1516  w0 = _mm_or_si128(w0, vn);
1517  _mm_storeu_si128((__m128i*)vp, w0);
1518 
1519  //interleave in ssse3 style
1520  w0 = _mm_unpacklo_epi32(row0, row1);
1521  w1 = _mm_unpackhi_epi32(row0, row1);
1522  row0 = _mm_unpacklo_epi32(w0, w1);
1523  row1 = _mm_unpackhi_epi32(w0, w1);
1524  _mm_store_si128((__m128i*)dp, row0);
1525  _mm_store_si128((__m128i*)(dp + stride), row1);
1526  }
1527  }
1528  }
1529  else
1530  {
1531  // reduce bitplane by 16 because we now have 16 bits instead of 32
1532  p -= 16;
1533 
1534  // We allocate a scratch row for storing v_n values.
1535  // We have 512 quads horizontally.
1536  // We may go beyond the last entry by up to 8 entries.
1537  // Therefore we allocate additional 8 entries.
1538  // There are two rows in this structure, the bottom
1539  // row is used to store processed entries.
1540  const int v_n_size = 512 + 8;
1541  ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1542 
1543  frwd_struct magsgn;
1544  frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1545 
1546  {
1547  ui16 *sp = scratch;
1548  ui16 *vp = v_n_scratch;
1549  ui32 *dp = decoded_data;
1550  vp[0] = 2; // for easy calculation of emax
1551 
1552  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1553  {
1554  //here we process two quads
1555  __m128i w0, w1; // workers
1556  __m128i inf_u_q, U_q;
1557  // determine U_q
1558  {
1559  inf_u_q = _mm_loadu_si128((__m128i*)sp);
1560  U_q = _mm_srli_epi32(inf_u_q, 16);
1561 
1562  w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1563  int i = _mm_movemask_epi8(w0);
1564  if (i & 0xFF) // only the lower two U_q
1565  return false;
1566  }
1567 
1568  __m128i vn = _mm_set1_epi16(2);
1569  __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1570  w0 = _mm_loadu_si128((__m128i*)vp);
1571  w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1572  w0 = _mm_or_si128(w0, vn);
1573  _mm_storeu_si128((__m128i*)vp, w0);
1574 
1575  //interleave in ssse3 style
1576  w0 = _mm_shuffle_epi8(row,
1577  _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1578  0x0504, -1, 0x0100, -1));
1579  _mm_store_si128((__m128i*)dp, w0);
1580  w1 = _mm_shuffle_epi8(row,
1581  _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1582  0x0706, -1, 0x0302, -1));
1583  _mm_store_si128((__m128i*)(dp + stride), w1);
1584  }
1585  }
1586 
1587  for (ui32 y = 2; y < height; y += 2)
1588  {
1589  {
1590  // perform 15 - count_leading_zeros(*vp) here
1591  ui16 *vp = v_n_scratch;
1592  const __m128i lut_lo = _mm_set_epi8(
1593  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 15
1594  );
1595  const __m128i lut_hi = _mm_set_epi8(
1596  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 15
1597  );
1598  const __m128i nibble_mask = _mm_set1_epi8(0x0F);
1599  const __m128i byte_offset8 = _mm_set1_epi16(8);
1600  const __m128i cc = _mm_set1_epi16(15);
1601  for (ui32 x = 0; x <= width; x += 16, vp += 8)
1602  {
1603  __m128i v, t; // workers
1604  v = _mm_loadu_si128((__m128i*)vp);
1605 
1606  t = _mm_and_si128(nibble_mask, v);
1607  v = _mm_and_si128(_mm_srli_epi16(v, 4), nibble_mask);
1608  t = _mm_shuffle_epi8(lut_lo, t);
1609  v = _mm_shuffle_epi8(lut_hi, v);
1610  v = _mm_min_epu8(v, t);
1611 
1612  t = _mm_srli_epi16(v, 8);
1613  v = _mm_or_si128(v, byte_offset8);
1614  v = _mm_min_epu8(v, t);
1615 
1616  v = _mm_sub_epi16(cc, v);
1617  _mm_storeu_si128((__m128i*)(vp + v_n_size), v);
1618  }
1619  }
1620 
1621  ui16 *vp = v_n_scratch;
1622  ui16 *sp = scratch + (y >> 1) * sstr;
1623  ui32 *dp = decoded_data + y * stride;
1624  vp[0] = 2; // for easy calculation of emax
1625 
1626  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1627  {
1628  //process two quads
1629  __m128i w0, w1; // workers
1630  __m128i inf_u_q, U_q;
1631  // determine U_q
1632  {
1633  __m128i gamma, emax, kappa, u_q; // needed locally
1634 
1635  inf_u_q = _mm_loadu_si128((__m128i*)sp);
1636  gamma = _mm_and_si128(inf_u_q, _mm_set1_epi32(0xF0));
1637  w0 = _mm_sub_epi32(gamma, _mm_set1_epi32(1));
1638  gamma = _mm_and_si128(gamma, w0);
1639  gamma = _mm_cmpeq_epi32(gamma, _mm_setzero_si128());
1640 
1641  emax = _mm_loadu_si128((__m128i*)(vp + v_n_size));
1642  w0 = _mm_bsrli_si128(emax, 2);
1643  emax = _mm_max_epi16(w0, emax); // no max_epi32 in ssse3
1644  emax = _mm_shuffle_epi8(emax,
1645  _mm_set_epi16(-1, 0x0706, -1, 0x0504,
1646  -1, 0x0302, -1, 0x0100));
1647  emax = _mm_andnot_si128(gamma, emax);
1648 
1649  kappa = _mm_set1_epi32(1);
1650  kappa = _mm_max_epi16(emax, kappa); // no max_epi32 in ssse3
1651 
1652  u_q = _mm_srli_epi32(inf_u_q, 16);
1653  U_q = _mm_add_epi32(u_q, kappa);
1654 
1655  w0 = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
1656  int i = _mm_movemask_epi8(w0);
1657  if (i & 0xFF) // only the lower two U_q
1658  return false;
1659  }
1660 
1661  __m128i vn = _mm_set1_epi16(2);
1662  __m128i row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1663  w0 = _mm_loadu_si128((__m128i*)vp);
1664  w0 = _mm_and_si128(w0, _mm_set_epi16(0,0,0,0,0,0,0,-1));
1665  w0 = _mm_or_si128(w0, vn);
1666  _mm_storeu_si128((__m128i*)vp, w0);
1667 
1668  w0 = _mm_shuffle_epi8(row,
1669  _mm_set_epi16(0x0D0C, -1, 0x0908, -1,
1670  0x0504, -1, 0x0100, -1));
1671  _mm_store_si128((__m128i*)dp, w0);
1672  w1 = _mm_shuffle_epi8(row,
1673  _mm_set_epi16(0x0F0E, -1, 0x0B0A, -1,
1674  0x0706, -1, 0x0302, -1));
1675  _mm_store_si128((__m128i*)(dp + stride), w1);
1676  }
1677  }
1678 
1679  // increase bitplane back by 16 because we need to process 32 bits
1680  p += 16;
1681  }
1682 
1683  if (num_passes > 1)
1684  {
1685  // We use scratch again, we can divide it into multiple regions
1686  // sigma holds all the significant samples, and it cannot
1687  // be modified after it is set. it will be used during the
1688  // Magnitude Refinement Pass
1689  ui16* const sigma = scratch;
1690 
1691  ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1692  // ui16 contains 4 columns
1693  mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1694 
1695  // We re-arrange quad significance, where each 4 consecutive
1696  // bits represent one quad, into column significance, where,
1697  // each 4 consecutive bits represent one column of 4 rows
1698  {
1699  ui32 y;
1700 
1701  const __m128i mask_3 = _mm_set1_epi32(0x30);
1702  const __m128i mask_C = _mm_set1_epi32(0xC0);
1703  const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1704  for (y = 0; y < height; y += 4)
1705  {
1706  ui16* sp = scratch + (y >> 1) * sstr;
1707  ui16* dp = sigma + (y >> 2) * mstr;
1708  for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1709  {
1710  __m128i s0, s1, u3, uC, t0, t1;
1711 
1712  s0 = _mm_loadu_si128((__m128i*)(sp));
1713  u3 = _mm_and_si128(s0, mask_3);
1714  u3 = _mm_srli_epi32(u3, 4);
1715  uC = _mm_and_si128(s0, mask_C);
1716  uC = _mm_srli_epi32(uC, 2);
1717  t0 = _mm_or_si128(u3, uC);
1718 
1719  s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1720  u3 = _mm_and_si128(s1, mask_3);
1721  u3 = _mm_srli_epi32(u3, 2);
1722  uC = _mm_and_si128(s1, mask_C);
1723  t1 = _mm_or_si128(u3, uC);
1724 
1725  __m128i r = _mm_or_si128(t0, t1);
1726  r = _mm_shuffle_epi8(r, shuffle_mask);
1727 
1728  // _mm_storeu_si32 is not defined, so we use this workaround
1729  _mm_store_ss((float*)dp, _mm_castsi128_ps(r));
1730  }
1731  dp[0] = 0; // set an extra entry on the right with 0
1732  }
1733  {
1734  // reset one row after the codeblock
1735  ui16* dp = sigma + (y >> 2) * mstr;
1736  __m128i zero = _mm_setzero_si128();
1737  for (ui32 x = 0; x < width; x += 32, dp += 8)
1738  _mm_store_si128((__m128i*)dp, zero);
1739  dp[0] = 0; // set an extra entry on the right with 0
1740  }
1741  }
1742 
1743  // We perform Significance Propagation Pass here
1744  {
1745  // This stores significance information of the previous
1746  // 4 rows. Significance information in this array includes
1747  // all significant samples in bitplane p - 1; that is,
1748  // significant samples for bitplane p (discovered during the
1749  // cleanup pass and stored in sigma) and samples that have recently
1750  // become significant (during the SPP) in bitplane p-1.
1751  // We store enough for the widest row, containing 1024 columns,
1752  // which is equivalent to 256 of ui16, since each stores 4 columns.
1753  // We add an extra 8 entries, just in case we need more
1754  ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1755 
1756  frwd_struct sigprop;
1757  frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1758 
1759  for (ui32 y = 0; y < height; y += 4)
1760  {
1761  ui32 pattern = 0xFFFFu; // a pattern of needed samples
1762  if (height - y < 4) {
1763  pattern = 0x7777u;
1764  if (height - y < 3) {
1765  pattern = 0x3333u;
1766  if (height - y < 2)
1767  pattern = 0x1111u;
1768  }
1769  }
1770 
1771  // prev holds sign. info. for the previous quad, together
1772  // with the rows on top of it and below it.
1773  ui32 prev = 0;
1774  ui16 *prev_sig = prev_row_sig;
1775  ui16 *cur_sig = sigma + (y >> 2) * mstr;
1776  ui32 *dpp = decoded_data + y * stride;
1777  for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1778  {
1779  // only rows and columns inside the stripe are included
1780  si32 s = (si32)x + 4 - (si32)width;
1781  s = ojph_max(s, 0);
1782  pattern = pattern >> (s * 4);
1783 
1784  // We first find locations that need to be tested (potential
1785  // SPP members); these location will end up in mbr
1786  // In each iteration, we produce 16 bits because cwd can have
1787  // up to 16 bits of significance information, followed by the
1788  // corresponding 16 bits of sign information; therefore, it is
1789  // sufficient to fetch 32 bit data per loop.
1790 
1791  // Although we are interested in 16 bits only, we load 32 bits.
1792  // For the 16 bits we are producing, we need the next 4 bits --
1793  // We need data for at least 5 columns out of 8.
1794  // Therefore loading 32 bits is easier than loading 16 bits
1795  // twice.
1796  ui32 ps = *(ui32*)prev_sig;
1797  ui32 ns = *(ui32*)(cur_sig + mstr);
1798  ui32 u = (ps & 0x88888888) >> 3; // the row on top
1799  if (!stripe_causal)
1800  u |= (ns & 0x11111111) << 3; // the row below
1801 
1802  ui32 cs = *(ui32*)cur_sig;
1803  // vertical integration
1804  ui32 mbr = cs; // this sig. info.
1805  mbr |= (cs & 0x77777777) << 1; //above neighbors
1806  mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1807  mbr |= u;
1808  // horizontal integration
1809  ui32 t = mbr;
1810  mbr |= t << 4; // neighbors on the left
1811  mbr |= t >> 4; // neighbors on the right
1812  mbr |= prev >> 12; // significance of previous group
1813 
1814  // remove outside samples, and already significant samples
1815  mbr &= pattern;
1816  mbr &= ~cs;
1817 
1818  // find samples that become significant during the SPP
1819  ui32 new_sig = mbr;
1820  if (new_sig)
1821  {
1822  __m128i cwd_vec = frwd_fetch<0>(&sigprop);
1823  ui32 cwd = (ui32)_mm_extract_epi16(cwd_vec, 0);
1824 
1825  ui32 cnt = 0;
1826  ui32 col_mask = 0xFu;
1827  ui32 inv_sig = ~cs & pattern;
1828  for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1829  {
1830  if ((col_mask & new_sig) == 0)
1831  continue;
1832 
1833  //scan one column
1834  ui32 sample_mask = 0x1111u & col_mask;
1835  if (new_sig & sample_mask)
1836  {
1837  new_sig &= ~sample_mask;
1838  if (cwd & 1)
1839  {
1840  ui32 t = 0x33u << i;
1841  new_sig |= t & inv_sig;
1842  }
1843  cwd >>= 1; ++cnt;
1844  }
1845 
1846  sample_mask <<= 1;
1847  if (new_sig & sample_mask)
1848  {
1849  new_sig &= ~sample_mask;
1850  if (cwd & 1)
1851  {
1852  ui32 t = 0x76u << i;
1853  new_sig |= t & inv_sig;
1854  }
1855  cwd >>= 1; ++cnt;
1856  }
1857 
1858  sample_mask <<= 1;
1859  if (new_sig & sample_mask)
1860  {
1861  new_sig &= ~sample_mask;
1862  if (cwd & 1)
1863  {
1864  ui32 t = 0xECu << i;
1865  new_sig |= t & inv_sig;
1866  }
1867  cwd >>= 1; ++cnt;
1868  }
1869 
1870  sample_mask <<= 1;
1871  if (new_sig & sample_mask)
1872  {
1873  new_sig &= ~sample_mask;
1874  if (cwd & 1)
1875  {
1876  ui32 t = 0xC8u << i;
1877  new_sig |= t & inv_sig;
1878  }
1879  cwd >>= 1; ++cnt;
1880  }
1881  }
1882 
1883  if (new_sig)
1884  {
1885  cwd |= (ui32)_mm_extract_epi16(cwd_vec, 1) << (16 - cnt);
1886 
1887  // Spread new_sig, such that each bit is in one byte with a
1888  // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1889  __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1890  new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1891  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1892  new_sig_vec = _mm_and_si128(new_sig_vec,
1893  _mm_set1_epi64x((si64)0x8040201008040201));
1894  new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1895  _mm_set1_epi64x((si64)0x8040201008040201));
1896 
1897  // find cumulative sums
1898  // to find which bit in cwd we should extract
1899  __m128i inc_sum = new_sig_vec; // inclusive scan
1900  inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1901  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1902  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1903  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1904  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1905  cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1906  // exclusive scan
1907  __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1908 
1909  // Spread cwd, such that each bit is in one byte
1910  // with a value of 0 or 1.
1911  cwd_vec = _mm_set1_epi16((si16)cwd);
1912  cwd_vec = _mm_shuffle_epi8(cwd_vec,
1913  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1914  cwd_vec = _mm_and_si128(cwd_vec,
1915  _mm_set1_epi64x((si64)0x8040201008040201));
1916  cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1917  _mm_set1_epi64x((si64)0x8040201008040201));
1918  cwd_vec = _mm_abs_epi8(cwd_vec);
1919 
1920  // Obtain bit from cwd_vec corresponding to ex_sum
1921  // Basically, collect needed bits from cwd_vec
1922  __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1923 
1924  // load data and set spp coefficients
1925  __m128i m =
1926  _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1927  __m128i val = _mm_set1_epi32(3 << (p - 2));
1928  ui32 *dp = dpp;
1929  for (int c = 0; c < 4; ++ c) {
1930  __m128i s0, s0_ns, s0_val;
1931  // load coefficients
1932  s0 = _mm_load_si128((__m128i*)dp);
1933 
1934  // epi32 is -1 only for coefficient that
1935  // are changed during the SPP
1936  s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1937  s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1938 
1939  // obtain sign for coefficients in SPP
1940  s0_val = _mm_shuffle_epi8(v, m);
1941  s0_val = _mm_slli_epi32(s0_val, 31);
1942  s0_val = _mm_or_si128(s0_val, val);
1943  s0_val = _mm_and_si128(s0_val, s0_ns);
1944 
1945  // update vector
1946  s0 = _mm_or_si128(s0, s0_val);
1947  // store coefficients
1948  _mm_store_si128((__m128i*)dp, s0);
1949  // prepare for next row
1950  dp += stride;
1951  m = _mm_add_epi32(m, _mm_set1_epi32(1));
1952  }
1953  }
1954  frwd_advance(&sigprop, cnt);
1955  }
1956 
1957  new_sig |= cs;
1958  *prev_sig = (ui16)(new_sig);
1959 
1960  // vertical integration for the new sig. info.
1961  t = new_sig;
1962  new_sig |= (t & 0x7777) << 1; //above neighbors
1963  new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1964  // add sig. info. from the row on top and below
1965  prev = new_sig | u;
1966  // we need only the bits in 0xF000
1967  prev &= 0xF000;
1968  }
1969  }
1970  }
1971 
1972  // We perform Magnitude Refinement Pass here
1973  if (num_passes > 2)
1974  {
1975  rev_struct magref;
1976  rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
1977 
1978  for (ui32 y = 0; y < height; y += 4)
1979  {
1980  ui16 *cur_sig = sigma + (y >> 2) * mstr;
1981  ui32 *dpp = decoded_data + y * stride;
1982  for (ui32 i = 0; i < width; i += 4, dpp += 4)
1983  {
1984  //Process one entry from sigma array at a time
1985  // Each nibble (4 bits) in the sigma array represents 4 rows,
1986  ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
1987  ui16 sig = *cur_sig++; // 16 bit that will be processed now
1988  int total_bits = 0;
1989  if (sig) // if any of the 32 bits are set
1990  {
1991  // We work on 4 rows, with 4 samples each, since
1992  // data is 32 bit (4 bytes)
1993 
1994  // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1995  __m128i sig_vec = _mm_set1_epi16((si16)sig);
1996  sig_vec = _mm_shuffle_epi8(sig_vec,
1997  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1998  sig_vec = _mm_and_si128(sig_vec,
1999  _mm_set1_epi64x((si64)0x8040201008040201));
2000  sig_vec = _mm_cmpeq_epi8(sig_vec,
2001  _mm_set1_epi64x((si64)0x8040201008040201));
2002  sig_vec = _mm_abs_epi8(sig_vec);
2003 
2004  // find cumulative sums
2005  // to find which bit in cwd we should extract
2006  __m128i inc_sum = sig_vec; // inclusive scan
2007  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
2008  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
2009  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
2010  inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
2011  total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
2012  __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
2013 
2014  // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2015  // cwd_vec. Then, convert these to a form suitable
2016  // for coefficient modifications; in particular, a value
2017  // of 0 is presented as binary 11, and a value of 1 is
2018  // represented as binary 01
2019  __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
2020  cwd_vec = _mm_shuffle_epi8(cwd_vec,
2021  _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
2022  cwd_vec = _mm_and_si128(cwd_vec,
2023  _mm_set1_epi64x((si64)0x8040201008040201));
2024  cwd_vec = _mm_cmpeq_epi8(cwd_vec,
2025  _mm_set1_epi64x((si64)0x8040201008040201));
2026  cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
2027  cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
2028  cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
2029 
2030  // load data and insert the mrp bit
2031  __m128i m =
2032  _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
2033  ui32 *dp = dpp;
2034  for (int c = 0; c < 4; ++c) {
2035  __m128i s0, s0_sig, s0_idx, s0_val;
2036  // load coefficients
2037  s0 = _mm_load_si128((__m128i*)dp);
2038  // find significant samples in this row
2039  s0_sig = _mm_shuffle_epi8(sig_vec, m);
2040  s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
2041  // get MRP bit index, and MRP pattern
2042  s0_idx = _mm_shuffle_epi8(ex_sum, m);
2043  s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
2044  // keep data from significant samples only
2045  s0_val = _mm_andnot_si128(s0_sig, s0_val);
2046  // move mrp bits to correct position, and employ
2047  s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
2048  s0 = _mm_xor_si128(s0, s0_val);
2049  // store coefficients
2050  _mm_store_si128((__m128i*)dp, s0);
2051  // prepare for next row
2052  dp += stride;
2053  m = _mm_add_epi32(m, _mm_set1_epi32(1));
2054  }
2055  }
2056  // consume data according to the number of bits set
2057  rev_advance_mrp(&magref, (ui32)total_bits);
2058  }
2059  }
2060  }
2061  }
2062 
2063  return true;
2064  }
2065  }
2066 }
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes rev_struct structure for MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct.
bool ojph_decode_codeblock_ssse3(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
static __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes two consecutive quads (one octet), using 16 bit data
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void rev_read(rev_struct *vlcp)
Read and unstuff data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_read(frwd_struct *msp)
Read and unstuffs 32 bits from forward-growing bitstream.
static ui32 frwd_fetch(frwd_struct *msp)
Fetches 32 bits from the frwd_struct bitstream.
static void frwd_init(frwd_struct *msp, const ui8 *data, int size)
Initialize frwd_struct struct and reads some bytes.
static __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes one quad, using 32 bit data
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition: ojph_defs.h:57
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition: ojph_arch.h:109
int32_t si32
Definition: ojph_defs.h:55
int16_t si16
Definition: ojph_defs.h:53
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define OJPH_WARN(t,...)
Definition: ojph_message.h:128
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui32 bits
number of bits stored in tmp
ui64 tmp
temporary buffer of read data
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data