blob: ac25fc525baef431279e5e88693e4d5d586622d5 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001/*
2 * Copyright (c) 2020 Arm Limited. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <stdint.h>
22#include <stdbool.h>
23#include <string.h>
24#include <assert.h>
25#include <math.h>
26#include <stdarg.h>
27#include <math.h>
28#include "mlw_common.h"
29#include "mlw_encode.h"
30
31#define DPRINTF(...)
32//#define DPRINTF(...) printf(__VA_ARGS__)
33
34#define ZERO_RUN_THRES 4
35
36#define min(a,b) ((a)<(b)?(a):(b))
37#define max(a,b) ((a)>(b)?(a):(b))
38
39typedef struct palette {
40 int16_t lut[32];
41 int16_t inv_lut[512];
42 int palsize; // number of palette entries
43 int palbits; // bit width of palette entries
44 int use_zero_runs; // zeros are coded separately
45 int only_palette; // no values outside the palette
46 int direct_offset; // added to the decoded weight index before direct conversion to sign/mag
47 int only_zeros; // special case that the section is all zeros
48} palette_t;
49
50static int is_power_of_two( int x ) {
51 return ((x-1) & x)==0;
52}
53
54static int get_palette_index_bits( int size ) {
55 int i;
56 for(i=7; i>=0; i--)
57 if (size > (1<<i) )
58 return i+1;
59 return 0;
60}
61
62// Search the stream for suitable palette restart positions
63// Return the number of restarts
64static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
65 int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
66 int prev_idx[512]; // For each value, keep track of the index of the previous occurence
67 int *restart_pos;
68 int max_palettes = size/64;
69
70 // Preliminary allocation of sufficient size
71 restart_pos = (int*)malloc( max_palettes*sizeof(int) );
72 last_restart_idx=0;
73 got_palette=0;
74 restart_i=1;
75 restart_pos[0] = 0;
76 zero_cnt=0;
77 memset( prev_idx, -1, sizeof(prev_idx));
78 for(i=0; i<size; i++) {
79 // Guess if zeros should be excluded from the palette
80 int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
81
82 if (got_palette) {
83 // Check if the next value is not covered by the current palette
84 if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
85 // New value: increase the palette size
86 palette_size++;
87 DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
88 if ( is_power_of_two(palette_size-1-exclude_zero) ) {
89 if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
90 // create a new palette because we extend a long lasting palette to require one more index bit
91 DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
92 assert( restart_i < max_palettes );
93 DPRINTF("restart %d pos %d\n", restart_i, i);
94 restart_pos[restart_i++] = i;
95 last_restart_idx = i;
96 got_palette=0;
97 zero_cnt=0;
98 }
99 }
100 }
101 }
102
103 prev_idx[ buf[i]+256 ] = i;
104 if (buf[i]==0)
105 zero_cnt++;
106
107 static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
108 int k;
109 // loop over window sizes
110 for(k=0; k<5; k++) {
111 // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
112 int N = window_sizes[k][0] * (got_palette?2:1);
113 if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
114 // Search backward to the position N nonzero values earlier
115 int nzcnt=0;
116 for( j=i; j>last_restart_idx; j--) {
117 if ( buf[j]!=0 ) {
118 if (nzcnt==N+1)
119 break;
120 nzcnt++;
121 }
122 }
123 int restart_idx = j;
124
125 // Calculate the size of a new palette (starting at restart_idx)
126 int new_palette_size=0;
127 for(j=0; j<512; j++) {
128 if ( prev_idx[j] >= restart_idx ) {
129 new_palette_size++;
130 }
131 }
132
133 int create_new_palette=0;
134 if (got_palette) {
135 int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
136 int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
137 int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
138 if ( savings>0 ) {
139 // Create new palette because it can be smaller than the existing palette
140 create_new_palette=1;
141 DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
142 }
143 } else {
144 if ( (new_palette_size-exclude_zero) <= 32) {
145 int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
146 // estimate if we will make savings by using palette mode
147 int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
148 create_new_palette = savings>0;
149 }
150 }
151 if (create_new_palette) {
152 palette_size=new_palette_size;
153 got_palette=1;
154 last_restart_idx = restart_idx;
155 DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
156 if ( restart_pos[restart_i-1] != last_restart_idx) {
157 assert( restart_i < max_palettes );
158 restart_pos[restart_i++] = last_restart_idx;
159 }
160 zero_cnt=0;
161 for( j=last_restart_idx; j<=i; j++)
162 if (buf[j]==0)
163 zero_cnt++;
164 }
165 }
166 }
167 }
168 // Reallocate to actual size
169 *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) );
170 return restart_i;
171}
172
173// Calculate frequency table
174static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
175 int i;
176 memset(freq, 0, 512*sizeof(int));
177 for(i=0; i<size; i++) {
178 freq[buf[i]+256]++;
179 }
180}
181
182static int cmp_uint64(const void * a, const void * b) {
183 uint64_t aa = *(uint64_t*)a;
184 uint64_t bb = *(uint64_t*)b;
185 return aa>bb ? -1 : aa<bb ? 1 : 0;
186}
187
188// Create palette from the given frequencies
189// Freq index 0-511 correspond to weights -256..255
190static void create_palette( int freq[512],
191 int use_zero_runs,
192 palette_t *p ) {
193 uint64_t freq64[512];
194 int i,all_cnt,all_max_val;
195
196 // Pair the frequency with the value so that
197 // the array can be sorted on frequency while keeping
198 // track of the corresponding palette value
199 memset(freq64, 0, sizeof(freq64));
200 all_cnt=0;
201 all_max_val=0;
202 for(i=-255; i<256; i++) {
203 if (i==0 && use_zero_runs)
204 continue;
205 int sign = i<0;
206 int mag = abs(i);
207 int palval = (mag<<1) | sign;
208
209 // Store palette value in 16 LSB bits, which will not affect the sorting
210 freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
211 all_cnt+=freq[i+256];
212
213 if (freq[i+256]>0) {
214 all_max_val = max(all_max_val, palval);
215 }
216 }
217
218 // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
219 for(i=0; i<31; i++) {
220 if ((freq64[i]>>16)!=0)
221 break;
222 }
223 p->direct_offset = i;
224
225 // Sort in descending frequency order
226 qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);
227
228 // Identify special case that there are no weights to code
229 // in the weight index stream (i.e. all weights are zeros)
230 p->only_zeros = (freq64[0]>>16)==0;
231 if (p->only_zeros) {
232 p->direct_offset=0;
233 }
234
235 // Check if all weights fit into the palette (and the palette is not empty)
236 p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;
237
238 int max_palette_size;
239 if (p->only_palette) {
240 max_palette_size = 32;
241 } else {
242 // For direct-lut we must make sure that the encoded weight
243 // index is not > 511. We do that by limiting the palette size
244 // such that the greatest value can be reached after subtracting
245 // the palette size.
246 max_palette_size = min(32, 511-all_max_val);
247 if (max_palette_size==1) {
248 max_palette_size=0; // because palette of size 1 is not supported
249 }
250 }
251
252 // Setup the 32 entry palette
253 int palette_max_val = 0, val, cnt, pal_cnt=0;
254 for(i=0; i<max_palette_size; i++) {
255 cnt = freq64[i]>>16;
256 val = freq64[i]&0xffff;
257 if ( cnt==0 )
258 break;
259 p->lut[i] = val;
260 palette_max_val = max(palette_max_val, val);
261 pal_cnt+=cnt;
262 }
263 if (i==1)
264 i++; // palette size of 1 is not supported, make it 2
265
266 // Heuristic for when to use the palette. If more than half of the
267 // weights are in the palette then we use it. This ensures we don't
268 // use palette for e.g. rectangular distributions.
269 int palbits_val;
270 if (pal_cnt > all_cnt/2) {
271 p->palsize = i;
272 palbits_val = palette_max_val;
273 } else {
274 // No palette
275 p->palsize = 0;
276 // If no palette, then palbits is used to specify the
277 // number of bits required for uncompressed mode, i.e.
278 // the number of bits for the greatest weight value
279 palbits_val = all_max_val;
280 }
281
282 // the palette entry bit width
283 // minimum 2bits (because PALBITS is in range 2..9)
284 int palbits=2;
285 while( (1<<palbits) <= palbits_val )
286 palbits++;
287 assert(palbits<=9);
288 p->palbits = palbits;
289 p->use_zero_runs = use_zero_runs;
290}
291
292// Return 1 if zero runs should be used
293// If palette_size is 512, then palette is not used (in that case the palette is setup
294// with the standard alternating unsigned to signed mapping)
295static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) {
296 int freq[512], i;
297
298 // Calculate frequencies of the given weight stream
299 calc_freq( inbuf, inbuf_size, freq);
300
301 // Find two most common values
302 int most_common_freq[2]={0}, most_common_val[2]={0};
303 for(i=0; i<512; i++) {
304 if ( freq[i] > most_common_freq[0] ) {
305 most_common_freq[1] = most_common_freq[0];
306 most_common_val[1] = most_common_val[0];
307 most_common_freq[0] = freq[i];
308 most_common_val[0] = i-256;
309 } else if ( freq[i] > most_common_freq[1] ) {
310 most_common_freq[1] = freq[i];
311 most_common_val[1] = i-256;
312 }
313 }
314
315 // Decide if zero-runs (alternating mode) should be used:
316 // * zero should be the most common symbol
317 // * zero should be sufficiently more common than the second most common symbol
318 int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1];
319
320 // Create the palette
321 create_palette( freq, use_zero_runs, p);
322
323 return use_zero_runs;
324}
325
326static void create_inverse_palette( palette_t *p) {
327 int i;
328 memset( p->inv_lut, 0, sizeof(p->inv_lut));
329 for(i=0; i<512; i++) {
330 int val = i;
331 int sign = val&1;
332 int mag = val>>1;
333 int weight = sign ? -mag : mag;
334 if (weight+256 < 512)
335 p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset;
336 }
337 for(i=0; i<p->palsize; i++) {
338 int val = p->lut[i];
339 int sign = val&1;
340 int mag = val>>1;
341 int weight = sign ? -mag : mag;
342 if (weight+256 < 512)
343 p->inv_lut[ weight+256 ] = i;
344 }
345}
346
347#define NWCFG 13
348#define NZCFG 4 // restrict search to ZDIV=0..3
349#define MAX_ZWCFG (max(NWCFG,NZCFG))
350
351// search state
352typedef struct search_state {
353 int bitcnt; // number of bits to reach this state
354 uint8_t prev_cfg; // previous grc parameter config
355} search_state_t;
356
357// (trunc<<4) | div, 0x20 means uncompressed
358static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
359static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
360
361
362
363// An algorithm similar to the Viterbi algorithm is used to search for a
364// good GRC parameter sequence for the given input value sequence.
365// The inval buffer can contain weights, weight indices or runs.
366// The return value is the resulting number of bitstream sections.
367static int search_grc_params( const int *inval_buf,
368 int n_inval,
369 int zrun_mode,
370 int uncompressed_bits,
371 uint8_t *grc_param_cfg,
372 int *grc_param_pos,
373 int max_grc_param_cfg,
374 int *existing_grc_param_pos,
375 int n_existing_grc_param_pos,
376 int *bitcnt )
377{
378 int n_cfg = zrun_mode ? NZCFG : NWCFG;
379 const char *grc_params = zrun_mode ? z_grc_params : w_grc_params;
380 int i,j;
381
382 search_state_t *state[MAX_ZWCFG];
383 for(i=0; i<n_cfg; i++) {
384 state[i] = malloc( sizeof(search_state_t) * (n_inval+1) );
385 state[i][0].bitcnt=0;
386 state[i][0].prev_cfg=i;
387 }
388
389 // Loop over inval_buf
390 int existing_idx=0;
391 for(i=0; i<n_inval; i++) {
392 int value = inval_buf[i];
393
394 // Best GRC parameter so far
395 int best_bitcnt=0x7fffffff, best_cfg=0;
396 for(j=0; j<n_cfg; j++) {
397 if (state[j][i].bitcnt < best_bitcnt) {
398 best_bitcnt = state[j][i].bitcnt;
399 best_cfg = j;
400 }
401 }
402
403 int cmd_cost = 40;
404 if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
405 // free transition, because the weight stream already inserted a command at this position
406 cmd_cost = 0;
407 existing_idx++;
408 }
409
410 // Loop over GRC parameters, calculate bits to code value, and then update the search state
411 for(j=0; j<n_cfg; j++) {
412 int div = grc_params[j]&15;
413 int trunc = grc_params[j]>>4;
414 int q = value>>div;
415 int bits = trunc ? min(q+1,2) + div : q+1+div;
416 if (!zrun_mode && ((trunc && q>2) || q>31))
417 bits=10000; // it's not possible to code the current value; give it a high cost
418 if (trunc==2)
419 bits=uncompressed_bits;
420
421 if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
422 // Change GRC parameters
423 state[j][i+1].prev_cfg = best_cfg;
424 state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits;
425 } else {
426 // Keep same GRC parameters
427 state[j][i+1].prev_cfg = j;
428 state[j][i+1].bitcnt = state[j][i].bitcnt + bits;
429 }
430 }
431 }
432
433
434 // Best GRC parameter
435 int best_bitcnt=0x7fffffff, best_cfg=0;
436 for(j=0; j<n_cfg; j++) {
437 if (state[j][n_inval].bitcnt < best_bitcnt) {
438 best_bitcnt = state[j][n_inval].bitcnt;
439 best_cfg = j;
440 }
441 }
442
443 int cfg = best_cfg;
444 int n_cmds=0;
445 for(i=n_inval; i>=0; i--) {
446 if (state[cfg][i].prev_cfg != cfg || i==0) {
447 n_cmds++;
448 cfg = state[cfg][i].prev_cfg;
449 }
450 }
451
452 (void)(max_grc_param_cfg);
453 assert(n_cmds<=max_grc_param_cfg);
454
455 cfg = best_cfg;
456 j=n_cmds-1;
457 int endpos=n_inval;
458 for(i=n_inval; i>=0; i--) {
459 if (state[cfg][i].prev_cfg != cfg || i==0) {
460 grc_param_cfg[j] = cfg;
461 grc_param_pos[j] = endpos;
462 j--;
463 cfg = state[cfg][i].prev_cfg;
464 endpos = i-1;
465 }
466 }
467 assert(j==-1);
468
469 for(i=0; i<n_cfg; i++) {
470 free(state[i]);
471 }
472
473 *bitcnt = best_bitcnt;
474
475 return n_cmds;
476}
477
478
479/////////////////////////////// Write to bitstream
480
481typedef struct bitbuf {
482 uint8_t *buf;
483 int buf_size; // in bytes
484 int pos; // bit pos of next bit
485 int log_symbols;
486} bitbuf_t;
487
488// size in byte
489static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
490 bb->buf = buf;
491 bb->pos = 0;
492 bb->buf_size = size;
493 bb->log_symbols = log_symbols;
494}
495
496static void bitbuf_putbit( bitbuf_t *bb, int bit) {
497 int byte_pos = bb->pos>>3;
498 int bit_pos = bb->pos&7;
499 assert( byte_pos >= 0 );
500 assert( byte_pos < bb->buf_size );
501 bb->buf[ byte_pos ] = (bb->buf[ byte_pos ] & ~(1<<bit_pos)) | (bit<<bit_pos);
502 bb->pos += 1;
503}
504
505static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
506 int i;
507 if (len>0) {
508 if (bb->log_symbols)
509 printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
510 for(i=0; i<len; i++) {
511 bitbuf_putbit(bb, (data>>i)&1);
512 }
513 }
514}
515
516// Return new bitpos
517static int encode_slice( const int *w_value,
518 const int *z_value,
519 int nvalues,
520 palette_t *p,
521 int new_palette,
522 int uncompressed_bits,
523 int w_cfg,
524 int z_cfg,
525 uint8_t *bitbuf,
526 int bitbuf_size,
527 int bitpos,
528 int verbose )
529{
530 int i,j;
531 bitbuf_t bitbuf_s, *bb=&bitbuf_s;
532 bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
533 bb->pos = bitpos;
534
535 assert(nvalues<32768);
536 // GRC parameters for this slice
537 int w_grc_div = w_grc_params[w_cfg] & 15;
538 int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1;
539 int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2;
540 int z_grc_div = z_grc_params[z_cfg] & 15;
541
542 if (w_uncompressed) {
543 w_grc_div = uncompressed_bits;
544 }
545
546 int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
547 int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;
548
549 if (verbose&1) {
550 printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
551 bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
552 }
553
554 // Write slice header
555 bitbuf_put( bb, "ZDIV", 3, zdiv);
556 bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
557 bitbuf_put( bb, "WDIV", 3, wdiv);
558 bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
559 bitbuf_put( bb, "NEWPAL", 1, new_palette );
560 if (new_palette) {
561 bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
562 bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
563 bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
564 for(i=0; i<p->palsize; i++) {
565 bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
566 }
567 }
568
569 int z_nvalues = nvalues + (new_palette?1:0);
570 int w_pos=0, z_pos=0;
571 int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
572 int z_unary=0, z_q=-1, z_r=0;
573 int w_nsymbols=0, w_remain[12]={0};
574 int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
575 int z_nsymbols=0, z_remain[12]={0};
576 int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
577 int z_unary_len = z_grc_div<3 ? 12 : 8;
578 do {
579 int balance = p->use_zero_runs ? w_pos - z_pos : 0;
580 int w_enable = balance<8 && w_pos<nvalues;
581 int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
582 if (w_enable) {
583 // Encode chunk (weights)
584 j=0;
585 w_nsymbols=0;
586 w_unary0=0;
587 w_unary1=0;
588 w_unary1_len=0;
589 int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
590 while(j<max_symbols) {
591 if (w_q<0) {
592 if (w_pos<nvalues) {
593 int value = w_value[w_pos];
594 assert(value<512);
595 w_q = value>>w_grc_div;
596 w_r = value&((1<<w_grc_div)-1);
597 assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
598 } else {
599 w_q = 0;
600 w_r = -1; // don't send remainder
601 }
602 }
603 while( w_q>=0 && j<max_symbols) {
604 w_unary0 |= w_q>0 ? (1<<j) : 0;
605 if (w_q>0) {
606 w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
607 w_unary1_len++;
608 }
609 j++;
610 w_q-=2;
611 if (w_grc_trunc)
612 w_q--;
613 }
614 if (w_q<0 && w_r>=0) {
615 w_remain[w_nsymbols] = w_r;
616 w_nsymbols++;
617 w_pos++;
618 }
619 }
620 }
621
622 if (z_enable) {
623 // Encode chunk (zrun)
624 j=0;
625 z_nsymbols=0;
626 z_unary=0;
627 while(j<z_unary_len) {
628 if (z_q<0) {
629 if (z_pos<z_nvalues) {
630 int value = z_value[z_pos];
631 z_q = value>>z_grc_div;
632 z_r = value&((1<<z_grc_div)-1);
633 } else {
634 z_q = 0;
635 z_r = -1;
636 }
637 }
638 while( z_q>=0 && j<z_unary_len) {
639 z_unary |= z_q>0 ? (1<<j) : 0;
640 j++;
641 z_q--;
642 }
643 if (z_q<0 && z_r>=0) {
644 z_remain[z_nsymbols] = z_r;
645 z_nsymbols++;
646 z_pos++;
647 }
648 }
649 }
650
651 // Write chunk to bitstream
652 if (w_enable && !w_uncompressed) {
653 bitbuf_put( bb, "WUNARY0", 12, w_unary0);
654 }
655 if (z_enable) {
656 bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
657 }
658 if (w_enable && !w_uncompressed) {
659 bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
660 }
661 if (w_prev_enable) {
662 for(i=0; i<w_prev_nsymbols; i++) {
663 bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
664 }
665 }
666 if (z_prev_enable) {
667 for(i=0; i<z_prev_nsymbols; i++) {
668 bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
669 }
670 }
671 w_prev_enable = w_enable;
672 w_prev_nsymbols = w_nsymbols;
673 memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
674 z_prev_enable = z_enable;
675 z_prev_nsymbols = z_nsymbols;
676 memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
677 } while( w_prev_enable || z_prev_enable );
678
679 return bb->pos;
680}
681
682
683// return new bitpos
684static int encode_section( const int16_t *inbuf,
685 int size,
686 palette_t *p,
687 uint8_t *bitbuf,
688 int bitbuf_size,
689 int bitpos,
690 int verbose )
691{
692 int uncompressed_bits;
693
694 // Uncompressed mode can only be used if either all weights
695 // are in the palette OR if the palette is not used.
696 if (p->only_palette) {
697 // Uncompressed bits derived from palette size
698 uncompressed_bits=0;
699 while( (1<<uncompressed_bits) < p->palsize )
700 uncompressed_bits++;
701 } else if (p->palsize==0) {
702 // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
703 uncompressed_bits = p->palbits;
704 } else {
705 // Don't use uncompressed
706 uncompressed_bits = 100;
707 }
708
709 int *weight_values = malloc( size*sizeof(int) );
710 int *zrun_values = malloc( size*sizeof(int) );
711
712 // Get weights (or weight indicies) AND zero-runs from the input weight stream.
713 int i=0, n_weights = 0, zcnt;
714 while(1) {
715 if (p->use_zero_runs) {
716 zcnt=0;
717 // Count zero run
718 // Special case: if all weights in the section are zero, we must
719 // still ensure we have one coded weight so the the slice length
720 // doesn't become 0. Therefore we skip the first zero run and code
721 // the zero explicitly as a weight value instead
722 if (!p->only_zeros || i>0) {
723 while( i<size && inbuf[i]==0) {
724 zcnt++;
725 i++;
726 }
727 }
728 zrun_values[n_weights] = zcnt;
729 }
730 if (i==size)
731 break;
732 int value = p->inv_lut[inbuf[i]+256];
733 weight_values[n_weights] = value;
734 n_weights++;
735 i++;
736 }
737
738 // Search for good GRC parameters for the weight stream
739 int n_w_slice, w_bitcnt;
740 uint8_t *w_slice_cfg;
741 int *w_slice_pos;
742 w_slice_cfg = malloc( size );
743 w_slice_pos = malloc( size*sizeof(int) );
744 n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
745 if (n_weights==0)
746 n_w_slice = 0;
747
748 // Search for good GRC parameters for the zrun stream
749 int n_z_slice=0, z_bitcnt=0;
750 uint8_t *z_slice_cfg=0;
751 int *z_slice_pos=0;
752 if (p->use_zero_runs) {
753 z_slice_cfg = malloc( size );
754 z_slice_pos = malloc( size*sizeof(int) );
755 n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
756 }
757
758 // Encode bitstream slice
759 int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
760 while(pos<n_weights || new_palette) {
761 int endpos=pos+32767; // max slice length
762
763 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
764 endpos = w_slice_pos[i_w_slice];
765 }
766
767 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
768 endpos = z_slice_pos[i_z_slice];
769 }
770
771 if (n_weights < endpos) {
772 endpos = n_weights;
773 }
774
775 // The first slice (when new_palette is 1) encodes zero runs both at the
776 // beginning and end (i.e. number of zero runs are len+1).
777 // The following slices only encode zero runs at the end (there cannot be
778 // any zeros in the beginning since they are encoded by the previous slice)
779 int len = endpos - pos;
780 int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
781 bitpos = encode_slice( weight_values+pos, zrun_buf, len,
782 p, new_palette, uncompressed_bits,
783 w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
784 bitbuf, bitbuf_size, bitpos, verbose );
785 new_palette = 0;
786
787 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
788 i_w_slice++;
789 }
790 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
791 i_z_slice++;
792 }
793 pos = endpos;
794 }
795
796 // Free temporary buffers
797 free(w_slice_cfg);
798 free(w_slice_pos);
799 if (p->use_zero_runs) {
800 free(z_slice_cfg);
801 free(z_slice_pos);
802 }
803 free(weight_values);
804 free(zrun_values);
805
806 return bitpos;
807}
808
809// Encode the given weight stream
810// inbuf uncompressed 9bit signed weights
811// inbuf_size number of weights
812// outbuf compressed bitstream, buffer is malloced
813// verbose if non-zero, printf log
814// Return value is the size in bytes of the compressed output
815// Return -1 if error
816int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
817 int i;
818 // Range check
819 for(i=0; i<inbuf_size; i++) {
820 if (inbuf[i]<-255 || inbuf[i]>255) {
821 printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]);
822 return -1;
823 }
824 }
825
826 int bitbuf_size = inbuf_size*2+1024;
827 *outbuf = malloc( bitbuf_size );
828
829 // Analyse input data to find palette re-programming points
830 int n_restarts;
831 int *palette_restart_pos;
832 n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos);
833
834 // Compress each section (using a single palette) separately
835 int bitpos=0;
836 for(i=0; i<n_restarts; i++) {
837 palette_t palette;
838 int pos, size;
839 pos = palette_restart_pos[i];
840 size = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos;
841 find_palette( inbuf+pos, size, &palette);
842 create_inverse_palette( &palette);
843 bitpos = encode_section( inbuf+pos, size, &palette,
844 *outbuf, bitbuf_size, bitpos, verbose );
845 }
846
847
848 // Add end of stream marker and align to 128bit
849 {
850 bitbuf_t bitbuf_s, *bb=&bitbuf_s;
851 bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 );
852 bb->pos = bitpos;
853 bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS);
854 bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff );
855
856 // Pad with 0xff until 64bit aligned
857 while( bb->pos & 127 ) {
858 bitbuf_put( bb, "PAD", 8, 0xff );
859 }
860 bitpos = bb->pos;
861 }
862 assert((bitpos&127)==0);
863 int outbuf_size = bitpos/8;
864 *outbuf = realloc( *outbuf, outbuf_size);
865
866 free(palette_restart_pos);
867
868 return outbuf_size;
869}
870
871void mlw_free_outbuf( uint8_t *outbuf ) {
872 if (outbuf)
873 free(outbuf);
874}