OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_block_decoder_wasm.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2022, Aous Naman
6 // Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2022, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_block_decoder_wasm.cpp
34 // Author: Aous Naman
35 // Date: 13 May 2022
36 //***************************************************************************/
37 
38 //***************************************************************************/
43 #include <string>
44 #include <iostream>
45 
46 #include <cassert>
47 #include <cstring>
48 #include "ojph_block_common.h"
49 #include "ojph_block_decoder.h"
50 #include "ojph_arch.h"
51 #include "ojph_message.h"
52 
53 #include <wasm_simd128.h>
54 
55 namespace ojph {
56  namespace local {
57 
58  //************************************************************************/
    // Repeats a literal N times; used to build the argument lists of the
    // wasm_*_const(...) intrinsics below, which take one value per lane.
    #define OJPH_REPEAT2(a) a,a
    #define OJPH_REPEAT4(a) a,a,a,a
    #define OJPH_REPEAT8(a) a,a,a,a,a,a,a,a
    #define OJPH_REPEAT16(a) a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
65 
66  //************************************************************************/
    // State of the MEL (adaptive run-length) decoder.
    struct dec_mel_st {
      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
        k(0), num_runs(0), runs(0)
      {}
      // data decoding machinery
      ui8* data;    // pointer to the next byte of MEL data to read
      ui64 tmp;     // bit buffer; bits are consumed from the MSB side
      int bits;     // number of valid bits in tmp
      int size;     // number of bytes remaining in the MEL segment
      bool unstuff; // true if the previously read byte requires unstuffing
      int k;        // MEL state (index into mel_exp), kept in [0, 12]

      // queue of decoded runs
      int num_runs; // number of runs currently queued in runs
      ui64 runs;    // up to 8 runs, 7 bits each, least significant first
    };
89 
90  //************************************************************************/
    // Reads and unstuffs up to 32 bits from the MEL segment into melp->tmp
    // (pushed toward the MSB, since MEL bits are consumed MSB-first).
    // A byte equal to 0xFF is followed by a stuffed bit, removed here.
    // When the segment is exhausted, 0xFF bytes are fed in; the last real
    // byte gets its low nibble forced to 0xF because the MEL and VLC
    // segments may overlap (see the standard).
    static inline
    void mel_read(dec_mel_st *melp)
    {
      if (melp->bits > 32) //there are enough bits in the tmp variable
        return;            // return without reading new data

      ui32 val = 0xFFFFFFFF;      // feed in 0xFF if buffer is exhausted
      if (melp->size > 4) {       // if there is data in the MEL segment
        val = *(ui32*)melp->data; // read 32 bits from MEL data
        melp->data += 4;          // advance pointer
        melp->size -= 4;          // reduce counter
      }
      else if (melp->size > 0)
      { // 4 or less
        int i = 0;
        while (melp->size > 1) {
          ui32 v = *melp->data++;    // read one byte at a time
          ui32 m = ~(0xFFu << i);    // mask of location
          val = (val & m) | (v << i);// put one byte in its correct location
          --melp->size;
          i += 8;
        }
        // size equal to 1
        ui32 v = *melp->data++; // the one before the last is different
        v |= 0xF;               // MEL and VLC segments can overlap
        ui32 m = ~(0xFFu << i);
        val = (val & m) | (v << i);
        --melp->size;
      }

      // next we unstuff them before adding them to the buffer
      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
                                     // the previously read byte requires
                                     // unstuffing

      // data is unstuffed and accumulated in t
      // bits has the number of bits in t
      ui32 t = val & 0xFF;
      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
      bits -= unstuff; // there is one less bit in t if unstuffing is needed
      t = t << (8 - unstuff); // move up to make room for the next byte

      // this is a repeat of the above, for the second byte
      t |= (val>>8) & 0xFF;
      unstuff = (((val >> 8) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      // third byte
      t |= (val>>16) & 0xFF;
      unstuff = (((val >> 16) & 0xFF) == 0xFF);
      bits -= unstuff;
      t = t << (8 - unstuff);

      // fourth byte; its unstuffing (if any) affects the NEXT read
      t |= (val>>24) & 0xFF;
      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);

      // move t to tmp, and push the result all the way up, so we read from
      // the MSB
      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
      melp->bits += bits; //increment the number of bits in tmp
    }
163 
164  //************************************************************************/
    // Decodes MEL codewords from melp->tmp, queuing up to 8 decoded runs
    // in melp->runs (7 bits per run).  Each stored value encodes a stretch
    // of zeros in its upper bits, with the LSB set when the stretch was
    // terminated by a one event.
    static inline
    void mel_decode(dec_mel_st *melp)
    {
      static const int mel_exp[13] = { //MEL exponents, indexed by state k
        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
      };

      if (melp->bits < 6) // if there are less than 6 bits in tmp
        mel_read(melp);   // then read from the MEL bitstream
                          // 6 bits is the largest decodable MEL cwd

      // repeat so long that there is enough decodable bits in tmp,
      //  and the runs store is not full (num_runs < 8)
      while (melp->bits >= 6 && melp->num_runs < 8)
      {
        int eval = mel_exp[melp->k]; // number of bits associated with state
        int run = 0;
        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
        { //one is found
          run = 1 << eval;
          run--; // consecutive runs of 0 events - 1
          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
          melp->tmp <<= 1; // consume one bit from tmp
          melp->bits -= 1;
          run = run << 1; // a stretch of zeros not terminating in one
        }
        else
        { //0 is found
          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
          melp->bits -= eval + 1;
          run = (run << 1) + 1; // a stretch of zeros terminating with one
        }
        eval = melp->num_runs * 7;           // 7 bits per run
        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
        melp->runs |= ((ui64)run) << eval;   // store the value in runs
        melp->num_runs++;                    // increment count
      }
    }
219 
220  //************************************************************************/
    // Initializes the MEL decoder.  bbuf points to the codeblock data,
    // lcup is the length of the cleanup pass, and scup is the length of
    // the MEL+VLC segments; the MEL segment starts at bbuf + lcup - scup.
    // Pre-loads bytes until the data pointer is 4-byte aligned so that
    // mel_read can use 32-bit reads.
    static inline
    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
    {
      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
      melp->bits = 0;                  // 0 bits in tmp
      melp->tmp = 0;                   //
      melp->unstuff = false;           // no unstuffing
      melp->size = scup - 1;           // size is the length of MEL+VLC-1
      melp->k = 0;                     // 0 for state
      melp->num_runs = 0;              // num_runs is 0
      melp->runs = 0;                  //

      // This code is borrowed; original is for a different architecture
      // These few lines take care of the case where data is not at a multiple
      //  of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
                                       //set data to 0xFF
        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
                                       // see the standard
        melp->data += melp->size-- > 0; //increment if the end is not reached
        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
        melp->bits += d_bits;          //increment tmp by number of bits
        melp->unstuff = ((d & 0xFF) == 0xFF); //true if next byte needs
                                              //unstuffing
      }
      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
                                       // is the MSB
    }
262 
263  //************************************************************************/
269  static inline
271  {
272  if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
273  mel_decode(melp);
274 
275  int t = melp->runs & 0x7F; //retrieve one run
276  melp->runs >>= 7; // remove the retrieved run
277  melp->num_runs--;
278  return t; // return run
279  }
280 
281  //************************************************************************/
    // State of a reverse-growing bitstream reader; used for both the VLC
    // and MRP segments, which are read backwards from their last byte.
    struct rev_struct {
      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
      {}
      //storage
      ui8* data;    // pointer to the next byte to read (moves backwards)
      ui64 tmp;     // bit buffer; bits are consumed from the LSB side
      ui32 bits;    // number of valid bits in tmp
      int size;     // number of bytes remaining in the segment
      bool unstuff; // true if the next byte may need a bit removed
    };
296 
297  //************************************************************************/
    // Reads and unstuffs up to 32 bits from the VLC segment (backwards)
    // into vlcp->tmp, appended above the existing bits.  A bit is removed
    // when the previous byte was > 0x8F and the current byte's low 7 bits
    // are 0x7F.  Feeds zeros once the segment is exhausted.
    static inline
    void rev_read(rev_struct *vlcp)
    {
      //process 4 bytes at a time
      if (vlcp->bits > 32) // if there are more than 32 bits in tmp, then
        return;            // reading 32 bits can overflow vlcp->tmp
      ui32 val = 0;
      //the next line (the if statement) needs to be tested first
      if (vlcp->size > 3) // if there are more than 3 bytes left in VLC
      {
        // (vlcp->data - 3) move pointer back to read 32 bits at once
        val = *(ui32*)(vlcp->data - 3); // then read 32 bits
        vlcp->data -= 4;                // move data pointer back by 4
        vlcp->size -= 4;                // reduce available byte by 4
      }
      else if (vlcp->size > 0)
      { // 4 or less
        int i = 24;
        while (vlcp->size > 0) {
          ui32 v = *vlcp->data--; // read one byte at a time
          val |= (v << i);        // put byte in its correct location
          --vlcp->size;
          i -= 8;
        }
      }

      //accumulate in tmp, number of bits in tmp are stored in bits
      ui32 tmp = val >> 24; //start with the MSB byte
      ui32 bits;

      // test unstuff (previous byte is >0x8F), and this byte is 0x7F
      bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
      bool unstuff = (val >> 24) > 0x8F; //this is for the next byte

      tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 16) & 0xFF) > 0x8F;

      tmp |= ((val >> 8) & 0xFF) << bits; // third byte
      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = ((val >> 8) & 0xFF) > 0x8F;

      tmp |= (val & 0xFF) << bits; // fourth byte
      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
      unstuff = (val & 0xFF) > 0x8F;

      // now move the read and unstuffed bits into vlcp->tmp
      vlcp->tmp |= (ui64)tmp << vlcp->bits;
      vlcp->bits += bits;
      vlcp->unstuff = unstuff; // this for the next read
    }
368 
369  //************************************************************************/
    // Initializes the backward VLC reader.  The VLC segment ends at
    // data + lcup - 1; its last byte contributes only its upper 4 bits.
    // Pre-loads bytes until the pointer is 4-byte aligned so rev_read can
    // use 32-bit reads, then fills the buffer.
    static inline
    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
    {
      //first byte has only the upper 4 bits
      vlcp->data = data + lcup - 2;

      //size can not be larger than this, in fact it should be smaller
      vlcp->size = scup - 2;

      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
      vlcp->tmp = d >> 4;     // both initialize and set
      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
      // To read 32 bits, read from (vlcp->data - 3)
      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
      int tnum = num < vlcp->size ? num : vlcp->size;
      for (int i = 0; i < tnum; ++i) {
        ui64 d;
        d = *vlcp->data--; // read one byte and move read pointer
        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
        vlcp->bits += d_bits;
        vlcp->unstuff = d > 0x8F; // for next byte
      }
      vlcp->size -= tnum;
      rev_read(vlcp); // read another 32 bits
    }
416 
417  //************************************************************************/
424  static inline
426  {
427  if (vlcp->bits < 32) // if there are less then 32 bits, read more
428  {
429  rev_read(vlcp); // read 32 bits, but unstuffing might reduce this
430  if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
431  rev_read(vlcp); // read another 32
432  }
433  return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
434  }
435 
436  //************************************************************************/
442  static inline
443  ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
444  {
445  assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
446  vlcp->tmp >>= num_bits; // remove bits
447  vlcp->bits -= num_bits; // decrement the number of bits
448  return (ui32)vlcp->tmp;
449  }
450 
451  //************************************************************************/
462  static inline
464  {
465  //process 4 bytes at a time
466  if (mrp->bits > 32)
467  return;
468  ui32 val = 0;
469  if (mrp->size > 3) // If there are 3 byte or more
470  { // (mrp->data - 3) move pointer back to read 32 bits at once
471  val = *(ui32*)(mrp->data - 3); // read 32 bits
472  mrp->data -= 4; // move back pointer
473  mrp->size -= 4; // reduce count
474  }
475  else if (mrp->size > 0)
476  {
477  int i = 24;
478  while (mrp->size > 0) {
479  ui32 v = *mrp->data--; // read one byte at a time
480  val |= (v << i); // put byte in its correct location
481  --mrp->size;
482  i -= 8;
483  }
484  }
485 
486  //accumulate in tmp, and keep count in bits
487  ui32 bits, tmp = val >> 24;
488 
489  //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
490  bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
491  bool unstuff = (val >> 24) > 0x8F;
492 
493  //process the next byte
494  tmp |= ((val >> 16) & 0xFF) << bits;
495  bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
496  unstuff = ((val >> 16) & 0xFF) > 0x8F;
497 
498  tmp |= ((val >> 8) & 0xFF) << bits;
499  bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
500  unstuff = ((val >> 8) & 0xFF) > 0x8F;
501 
502  tmp |= (val & 0xFF) << bits;
503  bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
504  unstuff = (val & 0xFF) > 0x8F;
505 
506  mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
507  mrp->bits += bits;
508  mrp->unstuff = unstuff; // next byte
509  }
510 
511  //************************************************************************/
    // Initializes the backward MRP reader over the len2 bytes starting at
    // data + lcup.  unstuff starts true (see the standard).  Pre-loads
    // bytes until the pointer is 4-byte aligned so rev_read_mrp can use
    // 32-bit reads, then fills the buffer.
    static inline
    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
    {
      mrp->data = data + lcup + len2 - 1;
      mrp->size = len2;
      mrp->unstuff = true;
      mrp->bits = 0;
      mrp->tmp = 0;

      //This code is designed for an architecture that read address should
      // align to the read size (address multiple of 4 if read size is 4)
      //These few lines take care of the case where data is not at a multiple
      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream
      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
      for (int i = 0; i < num; ++i) {
        ui64 d;
        //read a byte, 0 if no more data
        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
        //check if unstuffing is needed
        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
        mrp->tmp |= d << mrp->bits; // move data to mrp->tmp
        mrp->bits += d_bits;
        mrp->unstuff = d > 0x8F; // for next byte
      }
      rev_read_mrp(mrp);
    }
552 
553  //************************************************************************/
560  static inline
562  {
563  if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
564  {
565  rev_read_mrp(mrp); // read 30-32 bits from mrp
566  if (mrp->bits < 32) // if there is a space of 32 bits
567  rev_read_mrp(mrp); // read more
568  }
569  return (ui32)mrp->tmp; // return the head of mrp->tmp
570  }
571 
572  //************************************************************************/
578  inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
579  {
580  assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
581  mrp->tmp >>= num_bits; // discard the lowest num_bits bits
582  mrp->bits -= num_bits;
583  return (ui32)mrp->tmp; // return data after consumption
584  }
585 
586  //************************************************************************/
    // State of a forward-growing bitstream reader (MagSgn / SgnProp data).
    struct frwd_struct {
      const ui8* data; // pointer to the next byte to read
      ui8 tmp[48];     // buffered unstuffed data; sized beyond 16 bytes so
                       //  that frwd_read's 16-byte stores at a byte offset
                       //  (up to tmp + 32) stay in bounds
      ui32 bits;       // number of valid bits in tmp
      ui32 unstuff;    // 1 if the next byte needs unstuffing, else 0
      int size;        // number of bytes remaining in the segment
    };
597 
598  //************************************************************************/
616  template<int X>
617  static inline
619  {
620  assert(msp->bits <= 128);
621 
622  v128_t offset, val, validity, all_xff;
623  val = wasm_v128_load(msp->data);
624  int bytes = msp->size >= 16 ? 16 : msp->size;
625  validity = wasm_i8x16_splat((char)bytes);
626  msp->data += bytes;
627  msp->size -= bytes;
628  ui32 bits = 128;
629  offset = wasm_i64x2_const(0x0706050403020100,0x0F0E0D0C0B0A0908);
630  validity = wasm_i8x16_gt(validity, offset);
631  all_xff = wasm_i8x16_const(OJPH_REPEAT16(-1));
632  if (X == 0xFF) // the compiler should remove this if statement
633  {
634  v128_t t = wasm_v128_xor(validity, all_xff); // complement
635  val = wasm_v128_or(t, val); // fill with 0xFF
636  }
637  else if (X == 0)
638  val = wasm_v128_and(validity, val); // fill with zeros
639  else
640  assert(0);
641 
642  v128_t ff_bytes;
643  ff_bytes = wasm_i8x16_eq(val, all_xff);
644  ff_bytes = wasm_v128_and(ff_bytes, validity);
645  ui32 flags = wasm_i8x16_bitmask(ff_bytes);
646  flags <<= 1; // unstuff following byte
647  ui32 next_unstuff = flags >> 16;
648  flags |= msp->unstuff;
649  flags &= 0xFFFF;
650  while (flags)
651  { // bit unstuffing occurs on average once every 256 bytes
652  // therefore it is not an issue if it is a bit slow
653  // here we process 16 bytes
654  --bits; // consuming one stuffing bit
655 
656  ui32 loc = 31 - count_leading_zeros(flags);
657  flags ^= 1 << loc;
658 
659  v128_t m, t, c;
660  t = wasm_i8x16_splat((char)loc);
661  m = wasm_i8x16_gt(offset, t);
662 
663  t = wasm_v128_and(m, val); // keep bits at locations larger than loc
664  c = wasm_u64x2_shr(t, 1); // 1 bits left
665  t = wasm_i64x2_shuffle(t, wasm_i64x2_const(0, 0), 1, 2);
666  t = wasm_i64x2_shl(t, 63); // keep the MSB only
667  t = wasm_v128_or(t, c); // combine the above 3 steps
668 
669  val = wasm_v128_or(t, wasm_v128_andnot(val, m));
670  }
671 
672  // combine with earlier data
673  assert(msp->bits >= 0 && msp->bits <= 128);
674  int cur_bytes = msp->bits >> 3;
675  ui32 cur_bits = msp->bits & 7;
676  v128_t b1, b2;
677  b1 = wasm_i64x2_shl(val, cur_bits);
678  //next shift 8 bytes right
679  b2 = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), val, 1, 2);
680  b2 = wasm_u64x2_shr(b2, 64u - cur_bits);
681  b2 = (cur_bits > 0) ? b2 : wasm_i64x2_const(0, 0);
682  b1 = wasm_v128_or(b1, b2);
683  b2 = wasm_v128_load(msp->tmp + cur_bytes);
684  b2 = wasm_v128_or(b1, b2);
685  wasm_v128_store(msp->tmp + cur_bytes, b2);
686 
687  ui32 consumed_bits = bits < 128u - cur_bits ? bits : 128u - cur_bits;
688  cur_bytes = (msp->bits + consumed_bits + 7) >> 3; // round up
689  int upper = wasm_u16x8_extract_lane(val, 7);
690  upper >>= consumed_bits + 16 - 128;
691  msp->tmp[cur_bytes] = (ui8)upper; // copy byte
692 
693  msp->bits += bits;
694  msp->unstuff = next_unstuff; // next unstuff
695  assert(msp->unstuff == 0 || msp->unstuff == 1);
696  }
697 
698  //************************************************************************/
    // Initializes a forward bitstream reader over [data, data + size).
    // X is the fill byte frwd_read uses once the stream is exhausted.
    template<int X>
    static inline
    void frwd_init(frwd_struct *msp, const ui8* data, int size)
    {
      msp->data = data;
      // zero the whole 48-byte buffer, 16 bytes at a time
      wasm_v128_store(msp->tmp, wasm_i64x2_const(0, 0));
      wasm_v128_store(msp->tmp + 16, wasm_i64x2_const(0, 0));
      wasm_v128_store(msp->tmp + 32, wasm_i64x2_const(0, 0));

      msp->bits = 0;
      msp->unstuff = 0;
      msp->size = size;

      frwd_read<X>(msp); // read 128 bits more
    }
722 
723  //************************************************************************/
    // Consumes num_bits (0 < num_bits < 128) from the bitstream buffer by
    // shifting the 32-byte window in msp->tmp down by num_bits.  Whole
    // 16-byte multiples are handled by offsetting the load pointer; the
    // remaining 0-63 bits by 64-bit shifts with cross-lane carry.
    static inline
    void frwd_advance(frwd_struct *msp, ui32 num_bits)
    {
      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
      msp->bits -= num_bits;

      // skip whole 8-byte units (clamped to 16/24) via the pointer offset
      v128_t *p = (v128_t*)(msp->tmp + ((num_bits >> 3) & 0x18));
      num_bits &= 63; // bits still to shift out below

      v128_t v0, v1, c0, c1, t;
      v0 = wasm_v128_load(p);
      v1 = wasm_v128_load(p + 1);

      // shift right by num_bits
      c0 = wasm_u64x2_shr(v0, num_bits);
      t = wasm_i64x2_shuffle(v0, wasm_i64x2_const(0, 0), 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits);
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0); // a shift by 64 is UB
      c0 = wasm_v128_or(c0, t);
      t = wasm_i64x2_shuffle(wasm_i64x2_const(0, 0), v1, 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits);
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0);
      c0 = wasm_v128_or(c0, t);

      wasm_v128_store(msp->tmp, c0);

      c1 = wasm_u64x2_shr(v1, num_bits);
      t = wasm_i64x2_shuffle(v1, wasm_i64x2_const(0, 0), 1, 2);
      t = wasm_i64x2_shl(t, 64 - num_bits);
      t = (num_bits > 0) ? t : wasm_i64x2_const(0, 0);
      c1 = wasm_v128_or(c1, t);

      wasm_v128_store(msp->tmp + 16, c1);
    }
763 
764  //************************************************************************/
771  template<int X>
772  static inline
773  v128_t frwd_fetch(frwd_struct *msp)
774  {
775  if (msp->bits <= 128)
776  {
777  frwd_read<X>(msp);
778  if (msp->bits <= 128) //need to test
779  frwd_read<X>(msp);
780  }
781  v128_t t = wasm_v128_load(msp->tmp);
782  return t;
783  }
784 
785  //************************************************************************/
797  template <int N>
798  static inline
799  v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q,
800  frwd_struct* magsgn, ui32 p, v128_t& vn)
801  {
802  v128_t w0; // workers
803  v128_t insig; // lanes hold FF's if samples are insignificant
804  v128_t flags; // lanes hold e_k, e_1, and rho
805  v128_t row; // decoded row
806 
807  row = wasm_i64x2_const(0, 0);
808  w0 = wasm_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N);
809  // we keeps e_k, e_1, and rho in w2
810  flags = wasm_v128_and(w0, wasm_i32x4_const(0x1110,0x2220,0x4440,0x8880));
811  insig = wasm_i32x4_eq(flags, wasm_i64x2_const(0, 0));
812  if (wasm_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
813  {
814  U_q = wasm_i32x4_shuffle(U_q, U_q, N, N, N, N);
815  flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8,8,4,4,2,2,1,1));
816  v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
817 
818  // U_q holds U_q for this quad
819  // flags has e_k, e_1, and rho such that e_k is sitting in the
820  // 0x8000, e_1 in 0x800, and rho in 0x80
821 
822  // next e_k and m_n
823  v128_t m_n;
824  w0 = wasm_u32x4_shr(flags, 15); // e_k
825  m_n = wasm_i32x4_sub(U_q, w0);
826  m_n = wasm_v128_andnot(m_n, insig);
827 
828  // find cumulative sums
829  // to find at which bit in ms_vec the sample starts
830  v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
831  shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
832  inc_sum = wasm_i32x4_add(inc_sum, shfl);
833  shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum, 1, 2);
834  inc_sum = wasm_i32x4_add(inc_sum, shfl);
835  int total_mn = wasm_u16x8_extract_lane(inc_sum, 6);
836  ex_sum = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
837 
838  // find the starting byte and starting bit
839  v128_t byte_idx = wasm_u32x4_shr(ex_sum, 3);
840  v128_t bit_idx =
841  wasm_v128_and(ex_sum, wasm_i32x4_const(OJPH_REPEAT4(7)));
842  byte_idx = wasm_i8x16_swizzle(byte_idx,
843  wasm_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C));
844  byte_idx =
845  wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x03020100)));
846  v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
847  byte_idx =
848  wasm_i32x4_add(byte_idx, wasm_i32x4_const(OJPH_REPEAT4(0x01010101)));
849  v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);
850 
851  // shift samples values to correct location
852  bit_idx = wasm_v128_or(bit_idx, wasm_i32x4_shl(bit_idx, 16));
853  v128_t bit_shift = wasm_i8x16_swizzle(
854  wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
855  -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
856  bit_shift =
857  wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
858  d0 = wasm_i16x8_mul(d0, bit_shift);
859  d0 = wasm_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
860  d1 = wasm_i16x8_mul(d1, bit_shift);
861  d1 = // 8 in MSB
862  wasm_v128_and(d1, wasm_u32x4_const(OJPH_REPEAT4(0xFF00FF00)));
863  d0 = wasm_v128_or(d0, d1);
864 
865  // find location of e_k and mask
866  v128_t shift;
867  v128_t ones = wasm_i32x4_const(OJPH_REPEAT4(1));
868  v128_t twos = wasm_i32x4_const(OJPH_REPEAT4(2));
869  ui32 U_q_m1 = wasm_u32x4_extract_lane(U_q, 0) - 1u;
870  w0 = wasm_i32x4_sub(twos, w0);
871  shift = wasm_i32x4_shl(w0, U_q_m1);
872  ms_vec = wasm_v128_and(d0, wasm_i32x4_sub(shift, ones));
873 
874  // next e_1
875  w0 = wasm_v128_and(flags, wasm_i32x4_const(OJPH_REPEAT4(0x800)));
876  w0 = wasm_i32x4_eq(w0, wasm_i64x2_const(0, 0));
877  w0 = wasm_v128_andnot(shift, w0); // e_1 in correct position
878  ms_vec = wasm_v128_or(ms_vec, w0); // e_1
879  w0 = wasm_i32x4_shl(ms_vec, 31); // sign
880  ms_vec = wasm_v128_or(ms_vec, ones); // bin center
881  v128_t tvn = ms_vec;
882  ms_vec = wasm_i32x4_add(ms_vec, twos);// + 2
883  ms_vec = wasm_i32x4_shl(ms_vec, p - 1);
884  ms_vec = wasm_v128_or(ms_vec, w0); // sign
885  row = wasm_v128_andnot(ms_vec, insig); // significant only
886 
887  ms_vec = wasm_v128_andnot(tvn, insig); // significant only
888  if (N == 0) // the compiler should remove one
889  tvn = wasm_i8x16_swizzle(ms_vec,
890  wasm_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1));
891  else if (N == 1)
892  tvn = wasm_i8x16_swizzle(ms_vec,
893  wasm_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1));
894  else
895  assert(0);
896  vn = wasm_v128_or(vn, tvn);
897 
898  if (total_mn)
899  frwd_advance(magsgn, (ui32)total_mn);
900  }
901  return row;
902  }
903 
904  //************************************************************************/
    // Decodes two quads (8 samples) of 16-bit data from the MagSgn stream;
    // the 16-bit counterpart of decode_one_quad32, handling both quads of
    // inf_u_q/U_q in one call.  p positions the decoded values (shift by
    // p - 1); vn accumulates per-quad magnitude data.
    // Returns the 8 decoded samples; insignificant samples are 0.
    static inline
    v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q,
                             frwd_struct* magsgn, ui32 p, v128_t& vn)
    {
      v128_t w0;    // workers
      v128_t insig; // lanes hold FF's if samples are insignificant
      v128_t flags; // lanes hold e_k, e_1, and rho
      v128_t row;   // decoded row

      row = wasm_i64x2_const(0, 0);
      // broadcast each quad's info word into four 16-bit lanes
      w0 = wasm_i8x16_swizzle(inf_u_q,
        wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                         0x0504, 0x0504, 0x0504, 0x0504));
      // we keep e_k, e_1, and rho in flags
      flags = wasm_v128_and(w0,
        wasm_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880,
                         0x1110, 0x2220, 0x4440, 0x8880));
      insig = wasm_i16x8_eq(flags, wasm_i64x2_const(0, 0));
      if (wasm_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
      {
        U_q = wasm_i8x16_swizzle(U_q,
          wasm_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
                           0x0504, 0x0504, 0x0504, 0x0504));
        flags = wasm_i16x8_mul(flags, wasm_i16x8_const(8,4,2,1,8,4,2,1));
        v128_t ms_vec = frwd_fetch<0xFF>(magsgn);

        // U_q holds U_q for this quad
        // flags has e_k, e_1, and rho such that e_k is sitting in the
        //  0x8000, e_1 in 0x800, and rho in 0x80

        // next e_k and m_n
        v128_t m_n;
        w0 = wasm_u16x8_shr(flags, 15); // e_k
        m_n = wasm_i16x8_sub(U_q, w0);
        m_n = wasm_v128_andnot(m_n, insig); // zero m_n for insignificant

        // find cumulative sums
        // to find at which bit in ms_vec the sample starts
        v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
        shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0),
                                  inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum, 1, 2);
        inc_sum = wasm_i16x8_add(inc_sum, shfl);
        int total_mn = wasm_u16x8_extract_lane(inc_sum, 7);
        ex_sum = wasm_i16x8_shuffle(wasm_i64x2_const(0,0),
                                    inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);

        // find the starting byte and starting bit
        v128_t byte_idx = wasm_u16x8_shr(ex_sum, 3);
        v128_t bit_idx =
          wasm_v128_and(ex_sum, wasm_i16x8_const(OJPH_REPEAT8(7)));
        byte_idx = wasm_i8x16_swizzle(byte_idx,
          wasm_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606,
                           0x0808, 0x0A0A, 0x0C0C, 0x0E0E));
        byte_idx =
          wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0100)));
        v128_t d0 = wasm_i8x16_swizzle(ms_vec, byte_idx);
        byte_idx =
          wasm_i16x8_add(byte_idx, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
        v128_t d1 = wasm_i8x16_swizzle(ms_vec, byte_idx);

        // shift samples values to correct location
        // multiply by (1 << (8 - bit_idx)) via a swizzled table, to shift
        //  each 16-bit lane left without a per-lane variable shift
        v128_t bit_shift = wasm_i8x16_swizzle(
          wasm_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
                           -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
        bit_shift =
          wasm_i16x8_add(bit_shift, wasm_i16x8_const(OJPH_REPEAT8(0x0101)));
        d0 = wasm_i16x8_mul(d0, bit_shift);
        d0 = wasm_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
        d1 = wasm_i16x8_mul(d1, bit_shift);
        d1 = // 8 in MSB
          wasm_v128_and(d1, wasm_i16x8_const(OJPH_REPEAT8((si16)0xFF00)));
        d0 = wasm_v128_or(d0, d1);

        // find location of e_k and mask; the two quads may have different
        //  U_q, so each 64-bit half is shifted by its own amount
        v128_t shift, t0, t1;
        v128_t ones = wasm_i16x8_const(OJPH_REPEAT8(1));
        v128_t twos = wasm_i16x8_const(OJPH_REPEAT8(2));
        v128_t U_q_m1 = wasm_i32x4_sub(U_q, ones);
        ui32 Uq0 = wasm_u16x8_extract_lane(U_q_m1, 0);
        ui32 Uq1 = wasm_u16x8_extract_lane(U_q_m1, 4);
        w0 = wasm_i16x8_sub(twos, w0);
        t0 = wasm_v128_and(w0, wasm_i64x2_const(-1, 0));
        t1 = wasm_v128_and(w0, wasm_i64x2_const(0, -1));
        t0 = wasm_i32x4_shl(t0, Uq0);
        t1 = wasm_i32x4_shl(t1, Uq1);
        shift = wasm_v128_or(t0, t1);
        ms_vec = wasm_v128_and(d0, wasm_i16x8_sub(shift, ones));

        // next e_1
        w0 = wasm_v128_and(flags, wasm_i16x8_const(OJPH_REPEAT8(0x800)));
        w0 = wasm_i16x8_eq(w0, wasm_i64x2_const(0, 0));
        w0 = wasm_v128_andnot(shift, w0);    // e_1 in correct position
        ms_vec = wasm_v128_or(ms_vec, w0);   // e_1
        w0 = wasm_i16x8_shl(ms_vec, 15);     // sign
        ms_vec = wasm_v128_or(ms_vec, ones); // bin center
        v128_t tvn = ms_vec;
        ms_vec = wasm_i16x8_add(ms_vec, twos); // + 2
        ms_vec = wasm_i16x8_shl(ms_vec, p - 1);
        ms_vec = wasm_v128_or(ms_vec, w0);     // sign
        row = wasm_v128_andnot(ms_vec, insig); // significant only

        // fold magnitude info for both quads into vn
        ms_vec = wasm_v128_andnot(tvn, insig); // significant only
        w0 = wasm_i8x16_swizzle(ms_vec,
          wasm_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1));
        vn = wasm_v128_or(vn, w0);
        w0 = wasm_i8x16_swizzle(ms_vec,
          wasm_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1));
        vn = wasm_v128_or(vn, w0);

        if (total_mn)
          frwd_advance(magsgn, (ui32)total_mn);
      }
      return row;
    }
1032 
1033 
1034  //************************************************************************/
1051  bool ojph_decode_codeblock_wasm(ui8* coded_data, ui32* decoded_data,
1052  ui32 missing_msbs, ui32 num_passes,
1053  ui32 lengths1, ui32 lengths2,
1054  ui32 width, ui32 height, ui32 stride,
1055  bool stripe_causal)
1056  {
1057  static bool insufficient_precision = false;
1058  static bool modify_code = false;
1059  static bool truncate_spp_mrp = false;
1060 
1061  if (num_passes > 1 && lengths2 == 0)
1062  {
1063  OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1064  "one coding pass, but zero length for "
1065  "2nd and potential 3rd pass.\n");
1066  num_passes = 1;
1067  }
1068 
1069  if (num_passes > 3)
1070  {
1071  OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1072  "This codeblocks has %d passes.\n",
1073  num_passes);
1074  return false;
1075  }
1076 
1077  if (missing_msbs > 30) // p < 0
1078  {
1079  if (insufficient_precision == false)
1080  {
1081  insufficient_precision = true;
1082  OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1083  "codeblock. This message will not be "
1084  "displayed again.\n");
1085  }
1086  return false;
1087  }
1088  else if (missing_msbs == 30) // p == 0
1089  { // not enough precision to decode and set the bin center to 1
1090  if (modify_code == false) {
1091  modify_code = true;
1092  OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1093  "pass. The code can be modified to support "
1094  "this case. This message will not be "
1095  "displayed again.\n");
1096  }
1097  return false; // 32 bits are not enough to decode this
1098  }
1099  else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1100  {
1101  if (num_passes > 1) {
1102  num_passes = 1;
1103  if (truncate_spp_mrp == false) {
1104  truncate_spp_mrp = true;
1105  OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1106  "nor MagRef passes; both will be skipped. "
1107  "This message will not be displayed "
1108  "again.\n");
1109  }
1110  }
1111  }
1112  ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1113  // There is a way to handle the case of p == 0, but a different path
1114  // is required
1115 
1116  if (lengths1 < 2)
1117  {
1118  OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
1119  return false;
1120  }
1121 
1122  // read scup and fix the bytes there
1123  int lcup, scup;
1124  lcup = (int)lengths1; // length of CUP
1125  //scup is the length of MEL + VLC
1126  scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1127  if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1128  return false;
1129 
1130  // The temporary storage scratch holds two types of data in an
1131  // interleaved fashion. The interleaving allows us to use one
1132  // memory pointer.
1133  // We have one entry for a decoded VLC code, and one entry for UVLC.
1134  // Entries are 16 bits each, corresponding to one quad,
1135  // but since we want to use XMM registers of the SSE family
1136  // of SIMD; we allocated 16 bytes or more per quad row; that is,
1137  // the width is no smaller than 16 bytes (or 8 entries), and the
1138  // height is 512 quads
1139  // Each VLC entry contains, in the following order, starting
1140  // from MSB
1141  // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1142  // Each entry in UVLC contains u_q
1143  // One extra row to handle the case of SPP propagating downwards
1144  // when codeblock width is 4
1145  ui16 scratch[8 * 513] = {0}; // 8+ kB
1146 
1147  // We need an extra two entries (one inf and one u_q) beyond
1148  // the last column.
1149  // If the block width is 4 (2 quads), then we use sstr of 8
1150  // (enough for 4 quads). If width is 8 (4 quads) we use
1151  // sstr is 16 (enough for 8 quads). For a width of 16 (8
1152  // quads), we use 24 (enough for 12 quads).
1153  ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1154 
1155  assert((stride & 0x3) == 0);
1156 
1157  ui32 mmsbp2 = missing_msbs + 2;
1158 
1159  // The cleanup pass is decoded in two steps; in step one,
1160  // the VLC and MEL segments are decoded, generating a record that
1161  // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1162  // This information should be sufficient for the next step.
1163  // In step 2, we decode the MagSgn segment.
1164 
1165  // step 1 decoding VLC and MEL segments
1166  {
1167  // init structures
1168  dec_mel_st mel;
1169  mel_init(&mel, coded_data, lcup, scup);
1170  rev_struct vlc;
1171  rev_init(&vlc, coded_data, lcup, scup);
1172 
1173  int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1174  // data represented as runs of 0 events
1175  // See mel_decode description
1176 
1177  ui32 vlc_val;
1178  ui32 c_q = 0;
1179  ui16 *sp = scratch;
1180  //initial quad row
1181  for (ui32 x = 0; x < width; sp += 4)
1182  {
1183  // decode VLC
1185 
1186  // first quad
1187  vlc_val = rev_fetch(&vlc);
1188 
1189  //decode VLC using the context c_q and the head of VLC bitstream
1190  ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1191 
1192  // if context is zero, use one MEL event
1193  if (c_q == 0) //zero context
1194  {
1195  run -= 2; //subtract 2, since events number is multiplied by 2
1196 
1197  // Is the run terminated in 1? if so, use decoded VLC code,
1198  // otherwise, discard decoded data, since we will decode it again
1199  // using a different context
1200  t0 = (run == -1) ? t0 : 0;
1201 
1202  // is run -1 or -2? this means a run has been consumed
1203  if (run < 0)
1204  run = mel_get_run(&mel); // get another run
1205  }
1206  //run -= (c_q == 0) ? 2 : 0;
1207  //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1208  //if (run < 0)
1209  // run = mel_get_run(&mel); // get another run
1210  sp[0] = t0;
1211  x += 2;
1212 
1213  // prepare context for the next quad; eqn. 1 in ITU T.814
1214  c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1215 
1216  //remove data from vlc stream (0 bits are removed if vlc is not used)
1217  vlc_val = rev_advance(&vlc, t0 & 0x7);
1218 
1219  //second quad
1220  ui16 t1 = 0;
1221 
1222  //decode VLC using the context c_q and the head of VLC bitstream
1223  t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1224 
1225  // if context is zero, use one MEL event
1226  if (c_q == 0 && x < width) //zero context
1227  {
1228  run -= 2; //subtract 2, since events number is multiplied by 2
1229 
1230  // if event is 0, discard decoded t1
1231  t1 = (run == -1) ? t1 : 0;
1232 
1233  if (run < 0) // have we consumed all events in a run
1234  run = mel_get_run(&mel); // if yes, then get another run
1235  }
1236  t1 = x < width ? t1 : 0;
1237  //run -= (c_q == 0 && x < width) ? 2 : 0;
1238  //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1239  //if (run < 0)
1240  // run = mel_get_run(&mel); // get another run
1241  sp[2] = t1;
1242  x += 2;
1243 
1244  //prepare context for the next quad, eqn. 1 in ITU T.814
1245  c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1246 
1247  //remove data from vlc stream, if qinf is not used, cwdlen is 0
1248  vlc_val = rev_advance(&vlc, t1 & 0x7);
1249 
1250  // decode u
1252  // uvlc_mode is made up of u_offset bits from the quad pair
1253  ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1254  if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1255  { // the MEL run of events
1256  run -= 2; //subtract 2, since events number is multiplied by 2
1257 
1258  uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1259  // 0x40 if the run ends in 1
1260 
1261  if (run < 0)//if run is consumed (run is -1 or -2), get another run
1262  run = mel_get_run(&mel);
1263  }
1264  //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1265  //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1266  //if (run < 0)
1267  // run = mel_get_run(&mel); // get another run
1268 
1269  //decode uvlc_mode to get u for both quads
1270  ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1271  //remove total prefix length
1272  vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1273  uvlc_entry >>= 3;
1274  //extract suffixes for quad 0 and 1
1275  ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1276  ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1277  vlc_val = rev_advance(&vlc, len);
1278  uvlc_entry >>= 4;
1279  // quad 0 length
1280  len = uvlc_entry & 0x7; // quad 0 suffix length
1281  uvlc_entry >>= 3;
1282  ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1283  sp[1] = u_q;
1284  u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1285  sp[3] = u_q;
1286  }
1287  sp[0] = sp[1] = 0;
1288 
1289  //non initial quad rows
1290  for (ui32 y = 2; y < height; y += 2)
1291  {
1292  c_q = 0; // context
1293  ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1294 
1295  for (ui32 x = 0; x < width; sp += 4)
1296  {
1297  // decode VLC
1299 
1300  // sigma_q (n, ne, nf)
1301  c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1302  c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1303 
1304  // first quad
1305  vlc_val = rev_fetch(&vlc);
1306 
1307  //decode VLC using the context c_q and the head of VLC bitstream
1308  ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1309 
1310  // if context is zero, use one MEL event
1311  if (c_q == 0) //zero context
1312  {
1313  run -= 2; //subtract 2, since events number is multiplied by 2
1314 
1315  // Is the run terminated in 1? if so, use decoded VLC code,
1316  // otherwise, discard decoded data, since we will decode it again
1317  // using a different context
1318  t0 = (run == -1) ? t0 : 0;
1319 
1320  // is run -1 or -2? this means a run has been consumed
1321  if (run < 0)
1322  run = mel_get_run(&mel); // get another run
1323  }
1324  //run -= (c_q == 0) ? 2 : 0;
1325  //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1326  //if (run < 0)
1327  // run = mel_get_run(&mel); // get another run
1328  sp[0] = t0;
1329  x += 2;
1330 
1331  // prepare context for the next quad; eqn. 2 in ITU T.814
1332  // sigma_q (w, sw)
1333  c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1334  // sigma_q (nw)
1335  c_q |= sp[0 - (si32)sstr] & 0x80;
1336  // sigma_q (n, ne, nf)
1337  c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1338  c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1339 
1340  //remove data from vlc stream (0 bits are removed if vlc is unused)
1341  vlc_val = rev_advance(&vlc, t0 & 0x7);
1342 
1343  //second quad
1344  ui16 t1 = 0;
1345 
1346  //decode VLC using the context c_q and the head of VLC bitstream
1347  t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1348 
1349  // if context is zero, use one MEL event
1350  if (c_q == 0 && x < width) //zero context
1351  {
1352  run -= 2; //subtract 2, since events number is multiplied by 2
1353 
1354  // if event is 0, discard decoded t1
1355  t1 = (run == -1) ? t1 : 0;
1356 
1357  if (run < 0) // have we consumed all events in a run
1358  run = mel_get_run(&mel); // if yes, then get another run
1359  }
1360  t1 = x < width ? t1 : 0;
1361  //run -= (c_q == 0 && x < width) ? 2 : 0;
1362  //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1363  //if (run < 0)
1364  // run = mel_get_run(&mel); // get another run
1365  sp[2] = t1;
1366  x += 2;
1367 
1368  // partial c_q, will be completed when we process the next quad
1369  // sigma_q (w, sw)
1370  c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1371  // sigma_q (nw)
1372  c_q |= sp[2 - (si32)sstr] & 0x80;
1373 
1374  //remove data from vlc stream, if qinf is not used, cwdlen is 0
1375  vlc_val = rev_advance(&vlc, t1 & 0x7);
1376 
1377  // decode u
1379  // uvlc_mode is made up of u_offset bits from the quad pair
1380  ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1381  ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1382  //remove total prefix length
1383  vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
1384  uvlc_entry >>= 3;
1385  //extract suffixes for quad 0 and 1
1386  ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1387  ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
1388  vlc_val = rev_advance(&vlc, len);
1389  uvlc_entry >>= 4;
1390  // quad 0 length
1391  len = uvlc_entry & 0x7; // quad 0 suffix length
1392  uvlc_entry >>= 3;
1393  ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
1394  sp[1] = u_q;
1395  u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
1396  sp[3] = u_q;
1397  }
1398  sp[0] = sp[1] = 0;
1399  }
1400  }
1401 
1402  // step2 we decode magsgn
1403  // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1404  // The 32 bit path decodes 16-bit data, for which one would think
1405  // 16 bits are enough, because we want to put in the center of the
1406  // bin.
1407  // If you have mmsbp2 equals 16 bit, and reversible coding, and
1408  // no bitplanes are missing, then we can decode using the 16 bit
1409  // path, but we are not doing this here.
1410  if (mmsbp2 >= 16)
1411  {
1412  // We allocate a scratch row for storing v_n values.
1413  // We have 512 quads horizontally.
1414  // We may go beyond the last entry by up to 4 entries.
1415  // Here we allocate additional 8 entries.
1416  // There are two rows in this structure, the bottom
1417  // row is used to store processed entries.
1418  const int v_n_size = 512 + 8;
1419  ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
1420 
1421  frwd_struct magsgn;
1422  frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1423 
1424  {
1425  ui16 *sp = scratch;
1426  ui32 *vp = v_n_scratch;
1427  ui32 *dp = decoded_data;
1428  vp[0] = 2; // for easy calculation of emax
1429 
1430  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1431  {
1432  //here we process two quads
1433  v128_t w0, w1; // workers
1434  v128_t inf_u_q, U_q;
1435  // determine U_q
1436  {
1437  inf_u_q = wasm_v128_load(sp);
1438  U_q = wasm_u32x4_shr(inf_u_q, 16);
1439 
1440  w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1441  ui32 i = wasm_i8x16_bitmask(w0);
1442  if (i & 0xFF) // only the lower two U_q
1443  return false;
1444  }
1445 
1446  v128_t vn = wasm_i32x4_const(OJPH_REPEAT4(2));
1447  v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1448  v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1449  w0 = wasm_v128_load(vp);
1450  w0 = wasm_v128_and(w0, wasm_i32x4_const(-1,0,0,0));
1451  w0 = wasm_v128_or(w0, vn);
1452  wasm_v128_store(vp, w0);
1453 
1454  //interleave in ssse3 style
1455 
1456  w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1457  w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1458  row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1459  row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1460  wasm_v128_store(dp, row0);
1461  wasm_v128_store(dp + stride, row1);
1462  }
1463  }
1464 
1465  for (ui32 y = 2; y < height; y += 2)
1466  {
1467  {
1468  // perform 31 - count_leading_zeros(*vp) here
1469  ui32 *vp = v_n_scratch;
1470  const v128_t lut_lo = wasm_i8x16_const(
1471  31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1472  );
1473  const v128_t lut_hi = wasm_i8x16_const(
1474  31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1475  );
1476  const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
1477  const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
1478  const v128_t byte_offset16 = wasm_i16x8_const(OJPH_REPEAT8(16));
1479  const v128_t cc = wasm_i32x4_const(OJPH_REPEAT4(31));
1480  for (ui32 x = 0; x <= width; x += 8, vp += 4)
1481  {
1482  v128_t v, t; // workers
1483  v = wasm_v128_load(vp);
1484 
1485  t = wasm_v128_and(nibble_mask, v);
1486  v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
1487  t = wasm_i8x16_swizzle(lut_lo, t);
1488  v = wasm_i8x16_swizzle(lut_hi, v);
1489  v = wasm_u8x16_min(v, t);
1490 
1491  t = wasm_u16x8_shr(v, 8);
1492  v = wasm_v128_or(v, byte_offset8);
1493  v = wasm_u8x16_min(v, t);
1494 
1495  t = wasm_u32x4_shr(v, 16);
1496  v = wasm_v128_or(v, byte_offset16);
1497  v = wasm_u8x16_min(v, t);
1498 
1499  v = wasm_i16x8_sub(cc, v);
1500  wasm_v128_store(vp + v_n_size, v);
1501  }
1502  }
1503 
1504  ui32 *vp = v_n_scratch;
1505  ui16 *sp = scratch + (y >> 1) * sstr;
1506  ui32 *dp = decoded_data + y * stride;
1507  vp[0] = 2; // for easy calculation of emax
1508 
1509  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1510  {
1511  //process two quads
1512  v128_t w0, w1; // workers
1513  v128_t inf_u_q, U_q;
1514  // determine U_q
1515  {
1516  v128_t gamma, emax, kappa, u_q; // needed locally
1517 
1518  inf_u_q = wasm_v128_load(sp);
1519  gamma =
1520  wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
1521  w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
1522  gamma = wasm_v128_and(gamma, w0);
1523  gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
1524 
1525  emax = wasm_v128_load(vp + v_n_size);
1526  w0 = wasm_i32x4_shuffle(emax, wasm_i64x2_const(0,0), 1, 2, 3, 4);
1527  emax = wasm_i16x8_max(w0, emax); // no max_epi32 in ssse3
1528  emax = wasm_v128_andnot(emax, gamma);
1529 
1530  kappa = wasm_i32x4_const(OJPH_REPEAT4(1));
1531  kappa = wasm_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1532 
1533  u_q = wasm_u32x4_shr(inf_u_q, 16);
1534  U_q = wasm_i32x4_add(u_q, kappa);
1535 
1536  w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1537  ui32 i = wasm_i8x16_bitmask(w0);
1538  if (i & 0xFF) // only the lower two U_q
1539  return false;
1540  }
1541 
1542  v128_t vn = wasm_i32x4_const(OJPH_REPEAT4(2));
1543  v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
1544  v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
1545  w0 = wasm_v128_load(vp);
1546  w0 = wasm_v128_and(w0, wasm_i32x4_const(-1,0,0,0));
1547  w0 = wasm_v128_or(w0, vn);
1548  wasm_v128_store(vp, w0);
1549 
1550  //interleave in ssse3 style
1551  w0 = wasm_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
1552  w1 = wasm_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
1553  row0 = wasm_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
1554  row1 = wasm_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
1555  wasm_v128_store(dp, row0);
1556  wasm_v128_store(dp + stride, row1);
1557  }
1558  }
1559  }
1560  else
1561  {
1562  // reduce bitplane by 16 because we now have 16 bits instead of 32
1563  p -= 16;
1564 
1565  // We allocate a scratch row for storing v_n values.
1566  // We have 512 quads horizontally.
1567  // We may go beyond the last entry by up to 8 entries.
1568  // Therefore we allocate additional 8 entries.
1569  // There are two rows in this structure, the bottom
1570  // row is used to store processed entries.
1571  const int v_n_size = 512 + 8;
1572  ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
1573 
1574  frwd_struct magsgn;
1575  frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
1576 
1577  {
1578  ui16 *sp = scratch;
1579  ui16 *vp = v_n_scratch;
1580  ui32 *dp = decoded_data;
1581  vp[0] = 2; // for easy calculation of emax
1582 
1583  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1584  {
1585  //here we process two quads
1586  v128_t w0, w1; // workers
1587  v128_t inf_u_q, U_q;
1588  // determine U_q
1589  {
1590  inf_u_q = wasm_v128_load(sp);
1591  U_q = wasm_u32x4_shr(inf_u_q, 16);
1592 
1593  w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1594  ui32 i = wasm_i8x16_bitmask(w0);
1595  if (i & 0xFF) // only the lower two U_q
1596  return false;
1597  }
1598 
1599  v128_t vn = wasm_i16x8_const(OJPH_REPEAT8(2));
1600  v128_t row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1601  w0 = wasm_v128_load(vp);
1602  w0 = wasm_v128_and(w0, wasm_i16x8_const(-1,0,0,0,0,0,0,0));
1603  w0 = wasm_v128_or(w0, vn);
1604  wasm_v128_store(vp, w0);
1605 
1606  //interleave in ssse3 style
1607  w0 = wasm_i8x16_swizzle(row,
1608  wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
1609  -1, 0x0908, -1, 0x0D0C));
1610  wasm_v128_store(dp, w0);
1611  w1 = wasm_i8x16_swizzle(row,
1612  wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
1613  -1, 0x0B0A, -1, 0x0F0E));
1614  wasm_v128_store(dp + stride, w1);
1615  }
1616  }
1617 
1618  for (ui32 y = 2; y < height; y += 2)
1619  {
1620  {
1621  // perform 15 - count_leading_zeros(*vp) here
1622  ui16 *vp = v_n_scratch;
1623  const v128_t lut_lo = wasm_i8x16_const(
1624  15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
1625  );
1626  const v128_t lut_hi = wasm_i8x16_const(
1627  15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
1628  );
1629  const v128_t nibble_mask = wasm_i8x16_const(OJPH_REPEAT16(0x0F));
1630  const v128_t byte_offset8 = wasm_i16x8_const(OJPH_REPEAT8(8));
1631  const v128_t cc = wasm_i16x8_const(OJPH_REPEAT8(15));
1632  for (ui32 x = 0; x <= width; x += 16, vp += 8)
1633  {
1634  v128_t v, t; // workers
1635  v = wasm_v128_load(vp);
1636 
1637  t = wasm_v128_and(nibble_mask, v);
1638  v = wasm_v128_and(wasm_u16x8_shr(v, 4), nibble_mask);
1639  t = wasm_i8x16_swizzle(lut_lo, t);
1640  v = wasm_i8x16_swizzle(lut_hi, v);
1641  v = wasm_u8x16_min(v, t);
1642 
1643  t = wasm_u16x8_shr(v, 8);
1644  v = wasm_v128_or(v, byte_offset8);
1645  v = wasm_u8x16_min(v, t);
1646 
1647  v = wasm_i16x8_sub(cc, v);
1648  wasm_v128_store(vp + v_n_size, v);
1649  }
1650  }
1651 
1652  ui16 *vp = v_n_scratch;
1653  ui16 *sp = scratch + (y >> 1) * sstr;
1654  ui32 *dp = decoded_data + y * stride;
1655  vp[0] = 2; // for easy calculation of emax
1656 
1657  for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1658  {
1659  //process two quads
1660  v128_t w0, w1; // workers
1661  v128_t inf_u_q, U_q;
1662  // determine U_q
1663  {
1664  v128_t gamma, emax, kappa, u_q; // needed locally
1665 
1666  inf_u_q = wasm_v128_load(sp);
1667  gamma =
1668  wasm_v128_and(inf_u_q, wasm_i32x4_const(OJPH_REPEAT4(0xF0)));
1669  w0 = wasm_i32x4_sub(gamma, wasm_i32x4_const(OJPH_REPEAT4(1)));
1670  gamma = wasm_v128_and(gamma, w0);
1671  gamma = wasm_i32x4_eq(gamma, wasm_i64x2_const(0, 0));
1672 
1673  emax = wasm_v128_load(vp + v_n_size);
1674  w0 = wasm_i16x8_shuffle(emax,
1675  wasm_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8);
1676  emax = wasm_i16x8_max(w0, emax); // no max_epi32 in ssse3
1677  emax = wasm_i8x16_swizzle(emax,
1678  wasm_i16x8_const(0x0100, -1, 0x0302, -1,
1679  0x0504, -1, 0x0706, -1));
1680  emax = wasm_v128_andnot(emax, gamma);
1681 
1682  kappa = wasm_i32x4_const(OJPH_REPEAT4(1));
1683  kappa = wasm_i16x8_max(emax, kappa); // no max_epi32 in ssse3
1684 
1685  u_q = wasm_u32x4_shr(inf_u_q, 16);
1686  U_q = wasm_i32x4_add(u_q, kappa);
1687 
1688  w0 = wasm_i32x4_gt(U_q, wasm_u32x4_splat(mmsbp2));
1689  ui32 i = wasm_i8x16_bitmask(w0);
1690  if (i & 0xFF) // only the lower two U_q
1691  return false;
1692  }
1693 
1694  v128_t vn = wasm_i16x8_const(OJPH_REPEAT8(2));
1695  v128_t row = decode_two_quad16(inf_u_q, U_q, &magsgn, p, vn);
1696  w0 = wasm_v128_load(vp);
1697  w0 = wasm_v128_and(w0, wasm_i16x8_const(-1,0,0,0,0,0,0,0));
1698  w0 = wasm_v128_or(w0, vn);
1699  wasm_v128_store(vp, w0);
1700 
1701  w0 = wasm_i8x16_swizzle(row,
1702  wasm_i16x8_const(-1, 0x0100, -1, 0x0504,
1703  -1, 0x0908, -1, 0x0D0C));
1704  wasm_v128_store(dp, w0);
1705  w1 = wasm_i8x16_swizzle(row,
1706  wasm_i16x8_const(-1, 0x0302, -1, 0x0706,
1707  -1, 0x0B0A, -1, 0x0F0E));
1708  wasm_v128_store(dp + stride, w1);
1709  }
1710  }
1711 
1712  // increase bitplane back by 16 because we need to process 32 bits
1713  p += 16;
1714  }
1715 
1716  if (num_passes > 1)
1717  {
1718  // We use scratch again, we can divide it into multiple regions
1719  // sigma holds all the significant samples, and it cannot
1720  // be modified after it is set. it will be used during the
1721  // Magnitude Refinement Pass
1722  ui16* const sigma = scratch;
1723 
1724  ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1725  // ui16 contains 4 columns
1726  mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1727 
1728  // We re-arrange quad significance, where each 4 consecutive
1729  // bits represent one quad, into column significance, where,
1730  // each 4 consecutive bits represent one column of 4 rows
1731  {
1732  ui32 y;
1733 
1734  const v128_t mask_3 = wasm_i32x4_const(OJPH_REPEAT4(0x30));
1735  const v128_t mask_C = wasm_i32x4_const(OJPH_REPEAT4(0xC0));
1736  const v128_t shuffle_mask = wasm_i32x4_const(0x0C080400,-1,-1,-1);
1737  for (y = 0; y < height; y += 4)
1738  {
1739  ui16* sp = scratch + (y >> 1) * sstr;
1740  ui16* dp = sigma + (y >> 2) * mstr;
1741  for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1742  {
1743  v128_t s0, s1, u3, uC, t0, t1;
1744 
1745  s0 = wasm_v128_load(sp);
1746  u3 = wasm_v128_and(s0, mask_3);
1747  u3 = wasm_u32x4_shr(u3, 4);
1748  uC = wasm_v128_and(s0, mask_C);
1749  uC = wasm_u32x4_shr(uC, 2);
1750  t0 = wasm_v128_or(u3, uC);
1751 
1752  s1 = wasm_v128_load(sp + sstr);
1753  u3 = wasm_v128_and(s1, mask_3);
1754  u3 = wasm_u32x4_shr(u3, 2);
1755  uC = wasm_v128_and(s1, mask_C);
1756  t1 = wasm_v128_or(u3, uC);
1757 
1758  v128_t r = wasm_v128_or(t0, t1);
1759  r = wasm_i8x16_swizzle(r, shuffle_mask);
1760 
1761  wasm_v128_store32_lane(dp, r, 0);
1762  }
1763  dp[0] = 0; // set an extra entry on the right with 0
1764  }
1765  {
1766  // reset one row after the codeblock
1767  ui16* dp = sigma + (y >> 2) * mstr;
1768  v128_t zero = wasm_i64x2_const(0, 0);
1769  for (ui32 x = 0; x < width; x += 32, dp += 8)
1770  wasm_v128_store(dp, zero);
1771  dp[0] = 0; // set an extra entry on the right with 0
1772  }
1773  }
1774 
1775  // We perform Significance Propagation Pass here
1776  {
1777  // This stores significance information of the previous
1778  // 4 rows. Significance information in this array includes
1779  // all significant samples in bitplane p - 1; that is,
1780  // significant samples for bitplane p (discovered during the
1781  // cleanup pass and stored in sigma) and samples that have recently
1782  // become significant (during the SPP) in bitplane p-1.
1783  // We store enough for the widest row, containing 1024 columns,
1784  // which is equivalent to 256 of ui16, since each stores 4 columns.
1785  // We add an extra 8 entries, just in case we need more
1786  ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1787 
1788  frwd_struct sigprop;
1789  frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
1790 
1791  for (ui32 y = 0; y < height; y += 4)
1792  {
1793  ui32 pattern = 0xFFFFu; // a pattern of needed samples
1794  if (height - y < 4) {
1795  pattern = 0x7777u;
1796  if (height - y < 3) {
1797  pattern = 0x3333u;
1798  if (height - y < 2)
1799  pattern = 0x1111u;
1800  }
1801  }
1802 
1803  // prev holds sign. info. for the previous quad, together
1804  // with the rows on top of it and below it.
1805  ui32 prev = 0;
1806  ui16 *prev_sig = prev_row_sig;
1807  ui16 *cur_sig = sigma + (y >> 2) * mstr;
1808  ui32 *dpp = decoded_data + y * stride;
1809  for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1810  {
1811  // only rows and columns inside the stripe are included
1812  si32 s = (si32)x + 4 - (si32)width;
1813  s = ojph_max(s, 0);
1814  pattern = pattern >> (s * 4);
1815 
1816  // We first find locations that need to be tested (potential
1817  // SPP members); these location will end up in mbr
1818  // In each iteration, we produce 16 bits because cwd can have
1819  // up to 16 bits of significance information, followed by the
1820  // corresponding 16 bits of sign information; therefore, it is
1821  // sufficient to fetch 32 bit data per loop.
1822 
1823  // Although we are interested in 16 bits only, we load 32 bits.
1824  // For the 16 bits we are producing, we need the next 4 bits --
1825  // We need data for at least 5 columns out of 8.
1826  // Therefore loading 32 bits is easier than loading 16 bits
1827  // twice.
1828  ui32 ps = *(ui32*)prev_sig;
1829  ui32 ns = *(ui32*)(cur_sig + mstr);
1830  ui32 u = (ps & 0x88888888) >> 3; // the row on top
1831  if (!stripe_causal)
1832  u |= (ns & 0x11111111) << 3; // the row below
1833 
1834  ui32 cs = *(ui32*)cur_sig;
1835  // vertical integration
1836  ui32 mbr = cs; // this sig. info.
1837  mbr |= (cs & 0x77777777) << 1; //above neighbors
1838  mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1839  mbr |= u;
1840  // horizontal integration
1841  ui32 t = mbr;
1842  mbr |= t << 4; // neighbors on the left
1843  mbr |= t >> 4; // neighbors on the right
1844  mbr |= prev >> 12; // significance of previous group
1845 
1846  // remove outside samples, and already significant samples
1847  mbr &= pattern;
1848  mbr &= ~cs;
1849 
1850  // find samples that become significant during the SPP
1851  ui32 new_sig = mbr;
1852  if (new_sig)
1853  {
1854  v128_t cwd_vec = frwd_fetch<0>(&sigprop);
1855  ui32 cwd = wasm_u32x4_extract_lane(cwd_vec, 0);
1856 
1857  ui32 cnt = 0;
1858  ui32 col_mask = 0xFu;
1859  ui32 inv_sig = ~cs & pattern;
1860  for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1861  {
1862  if ((col_mask & new_sig) == 0)
1863  continue;
1864 
1865  //scan one column
1866  ui32 sample_mask = 0x1111u & col_mask;
1867  if (new_sig & sample_mask)
1868  {
1869  new_sig &= ~sample_mask;
1870  if (cwd & 1)
1871  {
1872  ui32 t = 0x33u << i;
1873  new_sig |= t & inv_sig;
1874  }
1875  cwd >>= 1; ++cnt;
1876  }
1877 
1878  sample_mask <<= 1;
1879  if (new_sig & sample_mask)
1880  {
1881  new_sig &= ~sample_mask;
1882  if (cwd & 1)
1883  {
1884  ui32 t = 0x76u << i;
1885  new_sig |= t & inv_sig;
1886  }
1887  cwd >>= 1; ++cnt;
1888  }
1889 
1890  sample_mask <<= 1;
1891  if (new_sig & sample_mask)
1892  {
1893  new_sig &= ~sample_mask;
1894  if (cwd & 1)
1895  {
1896  ui32 t = 0xECu << i;
1897  new_sig |= t & inv_sig;
1898  }
1899  cwd >>= 1; ++cnt;
1900  }
1901 
1902  sample_mask <<= 1;
1903  if (new_sig & sample_mask)
1904  {
1905  new_sig &= ~sample_mask;
1906  if (cwd & 1)
1907  {
1908  ui32 t = 0xC8u << i;
1909  new_sig |= t & inv_sig;
1910  }
1911  cwd >>= 1; ++cnt;
1912  }
1913  }
1914 
1915  if (new_sig)
1916  {
1917  // Spread new_sig, such that each bit is in one byte with a
1918  // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1919  v128_t new_sig_vec = wasm_i16x8_splat((si16)new_sig);
1920  new_sig_vec = wasm_i8x16_swizzle(new_sig_vec,
1921  wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
1922  new_sig_vec = wasm_v128_and(new_sig_vec,
1923  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1924  new_sig_vec = wasm_i8x16_eq(new_sig_vec,
1925  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1926 
1927  // find cumulative sums
1928  // to find which bit in cwd we should extract
1929  v128_t ex_sum, shfl, inc_sum = new_sig_vec; // inclusive scan
1930  inc_sum = wasm_i8x16_abs(inc_sum); // cvrt to 0 or 1
1931  shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
1932  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
1933  inc_sum = wasm_i8x16_add(inc_sum, shfl);
1934  shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0), inc_sum,
1935  7, 8, 9, 10, 11, 12, 13, 14);
1936  inc_sum = wasm_i8x16_add(inc_sum, shfl);
1937  shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum,
1938  3, 4, 5, 6);
1939  inc_sum = wasm_i8x16_add(inc_sum, shfl);
1940  shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum,
1941  1, 2);
1942  inc_sum = wasm_i8x16_add(inc_sum, shfl);
1943  cnt += wasm_u8x16_extract_lane(inc_sum, 15);
1944  // exclusive scan
1945  ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
1946  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
1947 
1948  // Spread cwd, such that each bit is in one byte
1949  // with a value of 0 or 1.
1950  cwd_vec = wasm_i16x8_splat((si16)cwd);
1951  cwd_vec = wasm_i8x16_swizzle(cwd_vec,
1952  wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
1953  cwd_vec = wasm_v128_and(cwd_vec,
1954  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1955  cwd_vec = wasm_i8x16_eq(cwd_vec,
1956  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
1957  cwd_vec = wasm_i8x16_abs(cwd_vec);
1958 
1959  // Obtain bit from cwd_vec corresponding to ex_sum
1960  // Basically, collect needed bits from cwd_vec
1961  v128_t v = wasm_i8x16_swizzle(cwd_vec, ex_sum);
1962 
1963  // load data and set spp coefficients
1964  v128_t m = wasm_i8x16_const(
1965  0,-1,-1,-1,4,-1,-1,-1,8,-1,-1,-1,12,-1,-1,-1);
1966  v128_t val = wasm_i32x4_splat(3 << (p - 2));
1967  ui32 *dp = dpp;
1968  for (int c = 0; c < 4; ++ c) {
1969  v128_t s0, s0_ns, s0_val;
1970  // load coefficients
1971  s0 = wasm_v128_load(dp);
1972 
1973  // epi32 is -1 only for coefficient that
1974  // are changed during the SPP
1975  s0_ns = wasm_i8x16_swizzle(new_sig_vec, m);
1976  s0_ns = wasm_i32x4_eq(s0_ns,
1977  wasm_i32x4_const(OJPH_REPEAT4(0xFF)));
1978 
1979  // obtain sign for coefficients in SPP
1980  s0_val = wasm_i8x16_swizzle(v, m);
1981  s0_val = wasm_i32x4_shl(s0_val, 31);
1982  s0_val = wasm_v128_or(s0_val, val);
1983  s0_val = wasm_v128_and(s0_val, s0_ns);
1984 
1985  // update vector
1986  s0 = wasm_v128_or(s0, s0_val);
1987  // store coefficients
1988  wasm_v128_store(dp, s0);
1989  // prepare for next row
1990  dp += stride;
1991  m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));
1992  }
1993  }
1994  frwd_advance(&sigprop, cnt);
1995  }
1996 
1997  new_sig |= cs;
1998  *prev_sig = (ui16)(new_sig);
1999 
2000  // vertical integration for the new sig. info.
2001  t = new_sig;
2002  new_sig |= (t & 0x7777) << 1; //above neighbors
2003  new_sig |= (t & 0xEEEE) >> 1; //below neighbors
2004  // add sig. info. from the row on top and below
2005  prev = new_sig | u;
2006  // we need only the bits in 0xF000
2007  prev &= 0xF000;
2008  }
2009  }
2010  }
2011 
2012  // We perform Magnitude Refinement Pass here
2013  if (num_passes > 2)
2014  {
2015  rev_struct magref;
2016  rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
2017 
2018  for (ui32 y = 0; y < height; y += 4)
2019  {
2020  ui16 *cur_sig = sigma + (y >> 2) * mstr;
2021  ui32 *dpp = decoded_data + y * stride;
2022  for (ui32 i = 0; i < width; i += 4, dpp += 4)
2023  {
2024  //Process one entry from sigma array at a time
2025  // Each nibble (4 bits) in the sigma array represents 4 rows,
2026  ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
2027  ui16 sig = *cur_sig++; // 16 bit that will be processed now
2028  int total_bits = 0;
2029  if (sig) // if any of the 32 bits are set
2030  {
2031  // We work on 4 rows, with 4 samples each, since
2032  // data is 32 bit (4 bytes)
2033 
2034  // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
2035  v128_t sig_vec = wasm_i16x8_splat((si16)sig);
2036  sig_vec = wasm_i8x16_swizzle(sig_vec,
2037  wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2038  sig_vec = wasm_v128_and(sig_vec,
2039  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2040  sig_vec = wasm_i8x16_eq(sig_vec,
2041  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2042  sig_vec = wasm_i8x16_abs(sig_vec);
2043 
2044  // find cumulative sums
2045  // to find which bit in cwd we should extract
2046  v128_t ex_sum, shfl, inc_sum = sig_vec; // inclusive scan
2047  shfl = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
2048  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2049  inc_sum = wasm_i8x16_add(inc_sum, shfl);
2050  shfl = wasm_i16x8_shuffle(wasm_i64x2_const(0,0), inc_sum,
2051  7, 8, 9, 10, 11, 12, 13, 14);
2052  inc_sum = wasm_i8x16_add(inc_sum, shfl);
2053  shfl = wasm_i32x4_shuffle(wasm_i64x2_const(0,0), inc_sum,
2054  3, 4, 5, 6);
2055  inc_sum = wasm_i8x16_add(inc_sum, shfl);
2056  shfl = wasm_i64x2_shuffle(wasm_i64x2_const(0,0), inc_sum,
2057  1, 2);
2058  inc_sum = wasm_i8x16_add(inc_sum, shfl);
2059  total_bits = wasm_u8x16_extract_lane(inc_sum, 15);
2060  // exclusive scan
2061  ex_sum = wasm_i8x16_shuffle(wasm_i64x2_const(0,0), inc_sum,
2062  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
2063 
2064  // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
2065  // cwd_vec. Then, convert these to a form suitable
2066  // for coefficient modifications; in particular, a value
2067  // of 0 is presented as binary 11, and a value of 1 is
2068  // represented as binary 01
2069  v128_t cwd_vec = wasm_i16x8_splat((si16)cwd);
2070  cwd_vec = wasm_i8x16_swizzle(cwd_vec,
2071  wasm_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
2072  cwd_vec = wasm_v128_and(cwd_vec,
2073  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2074  cwd_vec = wasm_i8x16_eq(cwd_vec,
2075  wasm_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
2076  cwd_vec =
2077  wasm_i8x16_add(cwd_vec, wasm_i8x16_const(OJPH_REPEAT16(1)));
2078  cwd_vec = wasm_i8x16_add(cwd_vec, cwd_vec);
2079  cwd_vec =
2080  wasm_v128_or(cwd_vec, wasm_i8x16_const(OJPH_REPEAT16(1)));
2081 
2082  // load data and insert the mrp bit
2083  v128_t m = wasm_i8x16_const(0,-1,-1,-1,4,-1,-1,-1,
2084  8,-1,-1,-1,12,-1,-1,-1);
2085  ui32 *dp = dpp;
2086  for (int c = 0; c < 4; ++c) {
2087  v128_t s0, s0_sig, s0_idx, s0_val;
2088  // load coefficients
2089  s0 = wasm_v128_load(dp);
2090  // find significant samples in this row
2091  s0_sig = wasm_i8x16_swizzle(sig_vec, m);
2092  s0_sig = wasm_i8x16_eq(s0_sig, wasm_i64x2_const(0, 0));
2093  // get MRP bit index, and MRP pattern
2094  s0_idx = wasm_i8x16_swizzle(ex_sum, m);
2095  s0_val = wasm_i8x16_swizzle(cwd_vec, s0_idx);
2096  // keep data from significant samples only
2097  s0_val = wasm_v128_andnot(s0_val, s0_sig);
2098  // move mrp bits to correct position, and employ
2099  s0_val = wasm_i32x4_shl(s0_val, p - 2);
2100  s0 = wasm_v128_xor(s0, s0_val);
2101  // store coefficients
2102  wasm_v128_store(dp, s0);
2103  // prepare for next row
2104  dp += stride;
2105  m = wasm_i32x4_add(m, wasm_i32x4_const(OJPH_REPEAT4(1)));
2106  }
2107  }
2108  // consume data according to the number of bits set
2109  rev_advance_mrp(&magref, (ui32)total_bits);
2110  }
2111  }
2112  }
2113  }
2114 
2115  return true;
2116  }
2117  }
2118 }
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
bool ojph_decode_codeblock_wasm(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
Decodes one codeblock, processing the cleanup, significance propagation, and magnitude refinement pa...
static ui32 rev_fetch(rev_struct *vlcp)
Retrieves 32 bits from the head of a rev_struct structure.
static void rev_init_mrp(rev_struct *mrp, ui8 *data, int lcup, int len2)
Initializes rev_struct structure for MRP segment, and reads a number of bytes such that the next 32 b...
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static void frwd_advance(frwd_struct *msp, ui32 num_bits)
Consume num_bits bits from the bitstream of frwd_struct.
static __m128i decode_two_quad16(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes two consecutive quads (one octet), using 16 bit data
static void rev_read_mrp(rev_struct *mrp)
Reads and unstuffs from rev_struct.
static ui32 rev_fetch_mrp(rev_struct *mrp)
Retrieves 32 bits from the head of a rev_struct structure.
static ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void rev_read(rev_struct *vlcp)
Read and unstuff data from a backwardly-growing segment.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void rev_init(rev_struct *vlcp, ui8 *data, int lcup, int scup)
Initiates the rev_struct structure and reads a few bytes to move the read address to multiple of 4.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
Consumes num_bits from a rev_struct structure.
static void frwd_read(frwd_struct *msp)
Read and unstuffs 32 bits from forward-growing bitstream.
static ui32 frwd_fetch(frwd_struct *msp)
Fetches 32 bits from the frwd_struct bitstream.
static void frwd_init(frwd_struct *msp, const ui8 *data, int size)
Initializes frwd_struct struct and reads some bytes.
static __m128i decode_one_quad32(const __m128i inf_u_q, __m128i U_q, frwd_struct *magsgn, ui32 p, __m128i &vn)
decodes one quad, using 32 bit data
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
uint64_t ui64
Definition: ojph_defs.h:56
uint16_t ui16
Definition: ojph_defs.h:52
static ui32 count_leading_zeros(ui32 val)
Definition: ojph_arch.h:109
int32_t si32
Definition: ojph_defs.h:55
int16_t si16
Definition: ojph_defs.h:53
uint32_t ui32
Definition: ojph_defs.h:54
uint8_t ui8
Definition: ojph_defs.h:50
#define OJPH_REPEAT2(a)
Macros that help with typing and space.
#define OJPH_REPEAT4(a)
#define OJPH_REPEAT16(a)
#define OJPH_REPEAT8(a)
#define ojph_max(a, b)
Definition: ojph_defs.h:73
#define OJPH_WARN(t,...)
Definition: ojph_message.h:128
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)
State structure for reading and unstuffing of forward-growing bitstreams; these are: MagSgn and SPP b...
const ui8 * data
pointer to bitstream
ui32 bits
number of bits stored in tmp
ui64 tmp
temporary buffer of read data
ui32 unstuff
1 if a bit needs to be unstuffed from next byte
A structure for reading and unstuffing a segment that grows backward, such as VLC and MRP.
ui32 bits
number of bits stored in tmp
int size
number of bytes left
ui8 * data
pointer to where to read data
ui64 tmp
temporary buffer of read data