blob: 7820106e82af0f56934da6c66b953a4990be2575 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001/*
2 * Copyright (c) 2020 Arm Limited. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <stdint.h>
22#include <stdbool.h>
23#include <string.h>
24#include <assert.h>
25#include <math.h>
26#include <stdarg.h>
27#include <math.h>
28#include "mlw_common.h"
29#include "mlw_encode.h"
30
31#define DPRINTF(...)
32//#define DPRINTF(...) printf(__VA_ARGS__)
33
34#define ZERO_RUN_THRES 4
35
36#define min(a,b) ((a)<(b)?(a):(b))
37#define max(a,b) ((a)>(b)?(a):(b))
38
39typedef struct palette {
40 int16_t lut[32];
41 int16_t inv_lut[512];
42 int palsize; // number of palette entries
43 int palbits; // bit width of palette entries
44 int use_zero_runs; // zeros are coded separately
45 int only_palette; // no values outside the palette
46 int direct_offset; // added to the decoded weight index before direct conversion to sign/mag
47 int only_zeros; // special case that the section is all zeros
48} palette_t;
49
50static int is_power_of_two( int x ) {
51 return ((x-1) & x)==0;
52}
53
54static int get_palette_index_bits( int size ) {
55 int i;
56 for(i=7; i>=0; i--)
57 if (size > (1<<i) )
58 return i+1;
59 return 0;
60}
61
62// Search the stream for suitable palette restart positions
63// Return the number of restarts
64static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
65 int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
66 int prev_idx[512]; // For each value, keep track of the index of the previous occurence
67 int *restart_pos;
Tim Hallbb330392020-04-29 15:13:00 +010068 int max_palettes = (size+63)/64;
Tim Hall79d07d22020-04-27 18:20:16 +010069
70 // Preliminary allocation of sufficient size
71 restart_pos = (int*)malloc( max_palettes*sizeof(int) );
72 last_restart_idx=0;
73 got_palette=0;
74 restart_i=1;
75 restart_pos[0] = 0;
76 zero_cnt=0;
77 memset( prev_idx, -1, sizeof(prev_idx));
78 for(i=0; i<size; i++) {
79 // Guess if zeros should be excluded from the palette
80 int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
81
82 if (got_palette) {
83 // Check if the next value is not covered by the current palette
84 if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
85 // New value: increase the palette size
86 palette_size++;
87 DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
88 if ( is_power_of_two(palette_size-1-exclude_zero) ) {
89 if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
90 // create a new palette because we extend a long lasting palette to require one more index bit
91 DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
Tim Hallbb330392020-04-29 15:13:00 +010092 if (restart_i == max_palettes) {
93 max_palettes = max_palettes*2;
94 restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) );
95 }
Tim Hall79d07d22020-04-27 18:20:16 +010096 DPRINTF("restart %d pos %d\n", restart_i, i);
97 restart_pos[restart_i++] = i;
98 last_restart_idx = i;
99 got_palette=0;
100 zero_cnt=0;
101 }
102 }
103 }
104 }
105
106 prev_idx[ buf[i]+256 ] = i;
107 if (buf[i]==0)
108 zero_cnt++;
109
110 static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
111 int k;
112 // loop over window sizes
113 for(k=0; k<5; k++) {
114 // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
115 int N = window_sizes[k][0] * (got_palette?2:1);
116 if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
117 // Search backward to the position N nonzero values earlier
118 int nzcnt=0;
119 for( j=i; j>last_restart_idx; j--) {
120 if ( buf[j]!=0 ) {
121 if (nzcnt==N+1)
122 break;
123 nzcnt++;
124 }
125 }
126 int restart_idx = j;
127
128 // Calculate the size of a new palette (starting at restart_idx)
129 int new_palette_size=0;
130 for(j=0; j<512; j++) {
131 if ( prev_idx[j] >= restart_idx ) {
132 new_palette_size++;
133 }
134 }
135
136 int create_new_palette=0;
137 if (got_palette) {
138 int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
139 int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
140 int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
141 if ( savings>0 ) {
142 // Create new palette because it can be smaller than the existing palette
143 create_new_palette=1;
144 DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
145 }
146 } else {
147 if ( (new_palette_size-exclude_zero) <= 32) {
148 int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
149 // estimate if we will make savings by using palette mode
150 int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
151 create_new_palette = savings>0;
152 }
153 }
154 if (create_new_palette) {
155 palette_size=new_palette_size;
156 got_palette=1;
157 last_restart_idx = restart_idx;
158 DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
159 if ( restart_pos[restart_i-1] != last_restart_idx) {
Tim Hallbb330392020-04-29 15:13:00 +0100160 if (restart_i == max_palettes) {
161 max_palettes = max_palettes*2;
162 restart_pos = (int*)realloc( restart_pos, max_palettes*sizeof(int) );
163 }
Tim Hall79d07d22020-04-27 18:20:16 +0100164 restart_pos[restart_i++] = last_restart_idx;
165 }
166 zero_cnt=0;
167 for( j=last_restart_idx; j<=i; j++)
168 if (buf[j]==0)
169 zero_cnt++;
170 }
171 }
172 }
173 }
174 // Reallocate to actual size
175 *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) );
176 return restart_i;
177}
178
179// Calculate frequency table
180static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
181 int i;
182 memset(freq, 0, 512*sizeof(int));
183 for(i=0; i<size; i++) {
184 freq[buf[i]+256]++;
185 }
186}
187
188static int cmp_uint64(const void * a, const void * b) {
189 uint64_t aa = *(uint64_t*)a;
190 uint64_t bb = *(uint64_t*)b;
191 return aa>bb ? -1 : aa<bb ? 1 : 0;
192}
193
194// Create palette from the given frequencies
195// Freq index 0-511 correspond to weights -256..255
196static void create_palette( int freq[512],
197 int use_zero_runs,
198 palette_t *p ) {
199 uint64_t freq64[512];
200 int i,all_cnt,all_max_val;
201
202 // Pair the frequency with the value so that
203 // the array can be sorted on frequency while keeping
204 // track of the corresponding palette value
205 memset(freq64, 0, sizeof(freq64));
206 all_cnt=0;
207 all_max_val=0;
208 for(i=-255; i<256; i++) {
209 if (i==0 && use_zero_runs)
210 continue;
211 int sign = i<0;
212 int mag = abs(i);
213 int palval = (mag<<1) | sign;
214
215 // Store palette value in 16 LSB bits, which will not affect the sorting
216 freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
217 all_cnt+=freq[i+256];
218
219 if (freq[i+256]>0) {
220 all_max_val = max(all_max_val, palval);
221 }
222 }
223
224 // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
225 for(i=0; i<31; i++) {
226 if ((freq64[i]>>16)!=0)
227 break;
228 }
229 p->direct_offset = i;
230
231 // Sort in descending frequency order
232 qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);
233
234 // Identify special case that there are no weights to code
235 // in the weight index stream (i.e. all weights are zeros)
236 p->only_zeros = (freq64[0]>>16)==0;
237 if (p->only_zeros) {
238 p->direct_offset=0;
239 }
240
241 // Check if all weights fit into the palette (and the palette is not empty)
242 p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;
243
244 int max_palette_size;
245 if (p->only_palette) {
246 max_palette_size = 32;
247 } else {
248 // For direct-lut we must make sure that the encoded weight
249 // index is not > 511. We do that by limiting the palette size
250 // such that the greatest value can be reached after subtracting
251 // the palette size.
252 max_palette_size = min(32, 511-all_max_val);
253 if (max_palette_size==1) {
254 max_palette_size=0; // because palette of size 1 is not supported
255 }
256 }
257
258 // Setup the 32 entry palette
259 int palette_max_val = 0, val, cnt, pal_cnt=0;
260 for(i=0; i<max_palette_size; i++) {
261 cnt = freq64[i]>>16;
262 val = freq64[i]&0xffff;
263 if ( cnt==0 )
264 break;
265 p->lut[i] = val;
266 palette_max_val = max(palette_max_val, val);
267 pal_cnt+=cnt;
268 }
269 if (i==1)
270 i++; // palette size of 1 is not supported, make it 2
271
272 // Heuristic for when to use the palette. If more than half of the
273 // weights are in the palette then we use it. This ensures we don't
274 // use palette for e.g. rectangular distributions.
275 int palbits_val;
276 if (pal_cnt > all_cnt/2) {
277 p->palsize = i;
278 palbits_val = palette_max_val;
279 } else {
280 // No palette
281 p->palsize = 0;
282 // If no palette, then palbits is used to specify the
283 // number of bits required for uncompressed mode, i.e.
284 // the number of bits for the greatest weight value
285 palbits_val = all_max_val;
286 }
287
288 // the palette entry bit width
289 // minimum 2bits (because PALBITS is in range 2..9)
290 int palbits=2;
291 while( (1<<palbits) <= palbits_val )
292 palbits++;
293 assert(palbits<=9);
294 p->palbits = palbits;
295 p->use_zero_runs = use_zero_runs;
296}
297
298// Return 1 if zero runs should be used
299// If palette_size is 512, then palette is not used (in that case the palette is setup
300// with the standard alternating unsigned to signed mapping)
301static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) {
302 int freq[512], i;
303
304 // Calculate frequencies of the given weight stream
305 calc_freq( inbuf, inbuf_size, freq);
306
307 // Find two most common values
308 int most_common_freq[2]={0}, most_common_val[2]={0};
309 for(i=0; i<512; i++) {
310 if ( freq[i] > most_common_freq[0] ) {
311 most_common_freq[1] = most_common_freq[0];
312 most_common_val[1] = most_common_val[0];
313 most_common_freq[0] = freq[i];
314 most_common_val[0] = i-256;
315 } else if ( freq[i] > most_common_freq[1] ) {
316 most_common_freq[1] = freq[i];
317 most_common_val[1] = i-256;
318 }
319 }
320
321 // Decide if zero-runs (alternating mode) should be used:
322 // * zero should be the most common symbol
323 // * zero should be sufficiently more common than the second most common symbol
324 int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1];
325
326 // Create the palette
327 create_palette( freq, use_zero_runs, p);
328
329 return use_zero_runs;
330}
331
332static void create_inverse_palette( palette_t *p) {
333 int i;
334 memset( p->inv_lut, 0, sizeof(p->inv_lut));
335 for(i=0; i<512; i++) {
336 int val = i;
337 int sign = val&1;
338 int mag = val>>1;
339 int weight = sign ? -mag : mag;
340 if (weight+256 < 512)
341 p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset;
342 }
343 for(i=0; i<p->palsize; i++) {
344 int val = p->lut[i];
345 int sign = val&1;
346 int mag = val>>1;
347 int weight = sign ? -mag : mag;
348 if (weight+256 < 512)
349 p->inv_lut[ weight+256 ] = i;
350 }
351}
352
353#define NWCFG 13
354#define NZCFG 4 // restrict search to ZDIV=0..3
355#define MAX_ZWCFG (max(NWCFG,NZCFG))
356
357// search state
358typedef struct search_state {
359 int bitcnt; // number of bits to reach this state
360 uint8_t prev_cfg; // previous grc parameter config
361} search_state_t;
362
363// (trunc<<4) | div, 0x20 means uncompressed
364static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
365static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
366
367
368
369// An algorithm similar to the Viterbi algorithm is used to search for a
370// good GRC parameter sequence for the given input value sequence.
371// The inval buffer can contain weights, weight indices or runs.
372// The return value is the resulting number of bitstream sections.
373static int search_grc_params( const int *inval_buf,
374 int n_inval,
375 int zrun_mode,
376 int uncompressed_bits,
377 uint8_t *grc_param_cfg,
378 int *grc_param_pos,
379 int max_grc_param_cfg,
380 int *existing_grc_param_pos,
381 int n_existing_grc_param_pos,
382 int *bitcnt )
383{
384 int n_cfg = zrun_mode ? NZCFG : NWCFG;
385 const char *grc_params = zrun_mode ? z_grc_params : w_grc_params;
386 int i,j;
387
388 search_state_t *state[MAX_ZWCFG];
389 for(i=0; i<n_cfg; i++) {
390 state[i] = malloc( sizeof(search_state_t) * (n_inval+1) );
391 state[i][0].bitcnt=0;
392 state[i][0].prev_cfg=i;
393 }
394
395 // Loop over inval_buf
396 int existing_idx=0;
397 for(i=0; i<n_inval; i++) {
398 int value = inval_buf[i];
399
400 // Best GRC parameter so far
401 int best_bitcnt=0x7fffffff, best_cfg=0;
402 for(j=0; j<n_cfg; j++) {
403 if (state[j][i].bitcnt < best_bitcnt) {
404 best_bitcnt = state[j][i].bitcnt;
405 best_cfg = j;
406 }
407 }
408
409 int cmd_cost = 40;
410 if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
411 // free transition, because the weight stream already inserted a command at this position
412 cmd_cost = 0;
413 existing_idx++;
414 }
415
416 // Loop over GRC parameters, calculate bits to code value, and then update the search state
417 for(j=0; j<n_cfg; j++) {
418 int div = grc_params[j]&15;
419 int trunc = grc_params[j]>>4;
420 int q = value>>div;
421 int bits = trunc ? min(q+1,2) + div : q+1+div;
422 if (!zrun_mode && ((trunc && q>2) || q>31))
423 bits=10000; // it's not possible to code the current value; give it a high cost
424 if (trunc==2)
425 bits=uncompressed_bits;
426
427 if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
428 // Change GRC parameters
429 state[j][i+1].prev_cfg = best_cfg;
430 state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits;
431 } else {
432 // Keep same GRC parameters
433 state[j][i+1].prev_cfg = j;
434 state[j][i+1].bitcnt = state[j][i].bitcnt + bits;
435 }
436 }
437 }
438
439
440 // Best GRC parameter
441 int best_bitcnt=0x7fffffff, best_cfg=0;
442 for(j=0; j<n_cfg; j++) {
443 if (state[j][n_inval].bitcnt < best_bitcnt) {
444 best_bitcnt = state[j][n_inval].bitcnt;
445 best_cfg = j;
446 }
447 }
448
449 int cfg = best_cfg;
450 int n_cmds=0;
451 for(i=n_inval; i>=0; i--) {
452 if (state[cfg][i].prev_cfg != cfg || i==0) {
453 n_cmds++;
454 cfg = state[cfg][i].prev_cfg;
455 }
456 }
457
458 (void)(max_grc_param_cfg);
459 assert(n_cmds<=max_grc_param_cfg);
460
461 cfg = best_cfg;
462 j=n_cmds-1;
463 int endpos=n_inval;
464 for(i=n_inval; i>=0; i--) {
465 if (state[cfg][i].prev_cfg != cfg || i==0) {
466 grc_param_cfg[j] = cfg;
467 grc_param_pos[j] = endpos;
468 j--;
469 cfg = state[cfg][i].prev_cfg;
470 endpos = i-1;
471 }
472 }
473 assert(j==-1);
474
475 for(i=0; i<n_cfg; i++) {
476 free(state[i]);
477 }
478
479 *bitcnt = best_bitcnt;
480
481 return n_cmds;
482}
483
484
485/////////////////////////////// Write to bitstream
486
487typedef struct bitbuf {
488 uint8_t *buf;
489 int buf_size; // in bytes
490 int pos; // bit pos of next bit
491 int log_symbols;
492} bitbuf_t;
493
494// size in byte
495static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
496 bb->buf = buf;
497 bb->pos = 0;
498 bb->buf_size = size;
499 bb->log_symbols = log_symbols;
500}
501
502static void bitbuf_putbit( bitbuf_t *bb, int bit) {
503 int byte_pos = bb->pos>>3;
504 int bit_pos = bb->pos&7;
505 assert( byte_pos >= 0 );
506 assert( byte_pos < bb->buf_size );
507 bb->buf[ byte_pos ] = (bb->buf[ byte_pos ] & ~(1<<bit_pos)) | (bit<<bit_pos);
508 bb->pos += 1;
509}
510
511static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
512 int i;
513 if (len>0) {
514 if (bb->log_symbols)
515 printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
516 for(i=0; i<len; i++) {
517 bitbuf_putbit(bb, (data>>i)&1);
518 }
519 }
520}
521
522// Return new bitpos
523static int encode_slice( const int *w_value,
524 const int *z_value,
525 int nvalues,
526 palette_t *p,
527 int new_palette,
528 int uncompressed_bits,
529 int w_cfg,
530 int z_cfg,
531 uint8_t *bitbuf,
532 int bitbuf_size,
533 int bitpos,
534 int verbose )
535{
536 int i,j;
537 bitbuf_t bitbuf_s, *bb=&bitbuf_s;
538 bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
539 bb->pos = bitpos;
540
541 assert(nvalues<32768);
542 // GRC parameters for this slice
543 int w_grc_div = w_grc_params[w_cfg] & 15;
544 int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1;
545 int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2;
546 int z_grc_div = z_grc_params[z_cfg] & 15;
547
548 if (w_uncompressed) {
549 w_grc_div = uncompressed_bits;
550 }
551
552 int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
553 int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;
554
555 if (verbose&1) {
556 printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
557 bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
558 }
559
560 // Write slice header
561 bitbuf_put( bb, "ZDIV", 3, zdiv);
562 bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
563 bitbuf_put( bb, "WDIV", 3, wdiv);
564 bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
565 bitbuf_put( bb, "NEWPAL", 1, new_palette );
566 if (new_palette) {
567 bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
568 bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
569 bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
570 for(i=0; i<p->palsize; i++) {
571 bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
572 }
573 }
574
575 int z_nvalues = nvalues + (new_palette?1:0);
576 int w_pos=0, z_pos=0;
577 int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
578 int z_unary=0, z_q=-1, z_r=0;
579 int w_nsymbols=0, w_remain[12]={0};
580 int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
581 int z_nsymbols=0, z_remain[12]={0};
582 int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
583 int z_unary_len = z_grc_div<3 ? 12 : 8;
584 do {
585 int balance = p->use_zero_runs ? w_pos - z_pos : 0;
586 int w_enable = balance<8 && w_pos<nvalues;
587 int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
588 if (w_enable) {
589 // Encode chunk (weights)
590 j=0;
591 w_nsymbols=0;
592 w_unary0=0;
593 w_unary1=0;
594 w_unary1_len=0;
595 int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
596 while(j<max_symbols) {
597 if (w_q<0) {
598 if (w_pos<nvalues) {
599 int value = w_value[w_pos];
600 assert(value<512);
601 w_q = value>>w_grc_div;
602 w_r = value&((1<<w_grc_div)-1);
603 assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
604 } else {
605 w_q = 0;
606 w_r = -1; // don't send remainder
607 }
608 }
609 while( w_q>=0 && j<max_symbols) {
610 w_unary0 |= w_q>0 ? (1<<j) : 0;
611 if (w_q>0) {
612 w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
613 w_unary1_len++;
614 }
615 j++;
616 w_q-=2;
617 if (w_grc_trunc)
618 w_q--;
619 }
620 if (w_q<0 && w_r>=0) {
621 w_remain[w_nsymbols] = w_r;
622 w_nsymbols++;
623 w_pos++;
624 }
625 }
626 }
627
628 if (z_enable) {
629 // Encode chunk (zrun)
630 j=0;
631 z_nsymbols=0;
632 z_unary=0;
633 while(j<z_unary_len) {
634 if (z_q<0) {
635 if (z_pos<z_nvalues) {
636 int value = z_value[z_pos];
637 z_q = value>>z_grc_div;
638 z_r = value&((1<<z_grc_div)-1);
639 } else {
640 z_q = 0;
641 z_r = -1;
642 }
643 }
644 while( z_q>=0 && j<z_unary_len) {
645 z_unary |= z_q>0 ? (1<<j) : 0;
646 j++;
647 z_q--;
648 }
649 if (z_q<0 && z_r>=0) {
650 z_remain[z_nsymbols] = z_r;
651 z_nsymbols++;
652 z_pos++;
653 }
654 }
655 }
656
657 // Write chunk to bitstream
658 if (w_enable && !w_uncompressed) {
659 bitbuf_put( bb, "WUNARY0", 12, w_unary0);
660 }
661 if (z_enable) {
662 bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
663 }
664 if (w_enable && !w_uncompressed) {
665 bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
666 }
667 if (w_prev_enable) {
668 for(i=0; i<w_prev_nsymbols; i++) {
669 bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
670 }
671 }
672 if (z_prev_enable) {
673 for(i=0; i<z_prev_nsymbols; i++) {
674 bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
675 }
676 }
677 w_prev_enable = w_enable;
678 w_prev_nsymbols = w_nsymbols;
679 memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
680 z_prev_enable = z_enable;
681 z_prev_nsymbols = z_nsymbols;
682 memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
683 } while( w_prev_enable || z_prev_enable );
684
685 return bb->pos;
686}
687
688
689// return new bitpos
690static int encode_section( const int16_t *inbuf,
691 int size,
692 palette_t *p,
693 uint8_t *bitbuf,
694 int bitbuf_size,
695 int bitpos,
696 int verbose )
697{
698 int uncompressed_bits;
699
700 // Uncompressed mode can only be used if either all weights
701 // are in the palette OR if the palette is not used.
702 if (p->only_palette) {
703 // Uncompressed bits derived from palette size
704 uncompressed_bits=0;
705 while( (1<<uncompressed_bits) < p->palsize )
706 uncompressed_bits++;
707 } else if (p->palsize==0) {
708 // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
709 uncompressed_bits = p->palbits;
710 } else {
711 // Don't use uncompressed
712 uncompressed_bits = 100;
713 }
714
715 int *weight_values = malloc( size*sizeof(int) );
716 int *zrun_values = malloc( size*sizeof(int) );
717
718 // Get weights (or weight indicies) AND zero-runs from the input weight stream.
719 int i=0, n_weights = 0, zcnt;
720 while(1) {
721 if (p->use_zero_runs) {
722 zcnt=0;
723 // Count zero run
724 // Special case: if all weights in the section are zero, we must
725 // still ensure we have one coded weight so the the slice length
726 // doesn't become 0. Therefore we skip the first zero run and code
727 // the zero explicitly as a weight value instead
728 if (!p->only_zeros || i>0) {
729 while( i<size && inbuf[i]==0) {
730 zcnt++;
731 i++;
732 }
733 }
734 zrun_values[n_weights] = zcnt;
735 }
736 if (i==size)
737 break;
738 int value = p->inv_lut[inbuf[i]+256];
739 weight_values[n_weights] = value;
740 n_weights++;
741 i++;
742 }
743
744 // Search for good GRC parameters for the weight stream
745 int n_w_slice, w_bitcnt;
746 uint8_t *w_slice_cfg;
747 int *w_slice_pos;
748 w_slice_cfg = malloc( size );
749 w_slice_pos = malloc( size*sizeof(int) );
750 n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
751 if (n_weights==0)
752 n_w_slice = 0;
753
754 // Search for good GRC parameters for the zrun stream
755 int n_z_slice=0, z_bitcnt=0;
756 uint8_t *z_slice_cfg=0;
757 int *z_slice_pos=0;
758 if (p->use_zero_runs) {
759 z_slice_cfg = malloc( size );
760 z_slice_pos = malloc( size*sizeof(int) );
761 n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
762 }
763
764 // Encode bitstream slice
765 int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
766 while(pos<n_weights || new_palette) {
767 int endpos=pos+32767; // max slice length
768
769 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
770 endpos = w_slice_pos[i_w_slice];
771 }
772
773 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
774 endpos = z_slice_pos[i_z_slice];
775 }
776
777 if (n_weights < endpos) {
778 endpos = n_weights;
779 }
780
781 // The first slice (when new_palette is 1) encodes zero runs both at the
782 // beginning and end (i.e. number of zero runs are len+1).
783 // The following slices only encode zero runs at the end (there cannot be
784 // any zeros in the beginning since they are encoded by the previous slice)
785 int len = endpos - pos;
786 int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
787 bitpos = encode_slice( weight_values+pos, zrun_buf, len,
788 p, new_palette, uncompressed_bits,
789 w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
790 bitbuf, bitbuf_size, bitpos, verbose );
791 new_palette = 0;
792
793 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
794 i_w_slice++;
795 }
796 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
797 i_z_slice++;
798 }
799 pos = endpos;
800 }
801
802 // Free temporary buffers
803 free(w_slice_cfg);
804 free(w_slice_pos);
805 if (p->use_zero_runs) {
806 free(z_slice_cfg);
807 free(z_slice_pos);
808 }
809 free(weight_values);
810 free(zrun_values);
811
812 return bitpos;
813}
814
815// Encode the given weight stream
816// inbuf uncompressed 9bit signed weights
817// inbuf_size number of weights
818// outbuf compressed bitstream, buffer is malloced
819// verbose if non-zero, printf log
820// Return value is the size in bytes of the compressed output
821// Return -1 if error
822int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
823 int i;
824 // Range check
825 for(i=0; i<inbuf_size; i++) {
826 if (inbuf[i]<-255 || inbuf[i]>255) {
827 printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]);
828 return -1;
829 }
830 }
831
832 int bitbuf_size = inbuf_size*2+1024;
833 *outbuf = malloc( bitbuf_size );
834
835 // Analyse input data to find palette re-programming points
836 int n_restarts;
837 int *palette_restart_pos;
838 n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos);
839
840 // Compress each section (using a single palette) separately
841 int bitpos=0;
842 for(i=0; i<n_restarts; i++) {
843 palette_t palette;
844 int pos, size;
845 pos = palette_restart_pos[i];
846 size = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos;
847 find_palette( inbuf+pos, size, &palette);
848 create_inverse_palette( &palette);
849 bitpos = encode_section( inbuf+pos, size, &palette,
850 *outbuf, bitbuf_size, bitpos, verbose );
851 }
852
853
854 // Add end of stream marker and align to 128bit
855 {
856 bitbuf_t bitbuf_s, *bb=&bitbuf_s;
857 bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 );
858 bb->pos = bitpos;
859 bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS);
860 bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff );
861
862 // Pad with 0xff until 64bit aligned
863 while( bb->pos & 127 ) {
864 bitbuf_put( bb, "PAD", 8, 0xff );
865 }
866 bitpos = bb->pos;
867 }
868 assert((bitpos&127)==0);
869 int outbuf_size = bitpos/8;
870 *outbuf = realloc( *outbuf, outbuf_size);
871
872 free(palette_restart_pos);
873
874 return outbuf_size;
875}
876
877void mlw_free_outbuf( uint8_t *outbuf ) {
878 if (outbuf)
879 free(outbuf);
880}