/*
| 2 | * Copyright (c) 2020 Arm Limited. All rights reserved. |
| 3 | * |
| 4 | * SPDX-License-Identifier: Apache-2.0 |
| 5 | * |
| 6 | * Licensed under the Apache License, Version 2.0 (the License); you may |
| 7 | * not use this file except in compliance with the License. |
| 8 | * You may obtain a copy of the License at |
| 9 | * |
| 10 | * www.apache.org/licenses/LICENSE-2.0 |
| 11 | * |
| 12 | * Unless required by applicable law or agreed to in writing, software |
| 13 | * distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 14 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | * See the License for the specific language governing permissions and |
| 16 | * limitations under the License. |
| 17 | */ |
| 18 | |
| 19 | #include <stdio.h> |
| 20 | #include <stdlib.h> |
| 21 | #include <stdint.h> |
| 22 | #include <stdbool.h> |
| 23 | #include <string.h> |
| 24 | #include <assert.h> |
| 25 | #include <math.h> |
| 26 | #include <stdarg.h> |
| 27 | #include <math.h> |
| 28 | #include "mlw_common.h" |
| 29 | #include "mlw_encode.h" |
| 30 | |
| 31 | #define DPRINTF(...) |
| 32 | //#define DPRINTF(...) printf(__VA_ARGS__) |
| 33 | |
| 34 | #define ZERO_RUN_THRES 4 |
| 35 | |
| 36 | #define min(a,b) ((a)<(b)?(a):(b)) |
| 37 | #define max(a,b) ((a)>(b)?(a):(b)) |
| 38 | |
// Palette state for one bitstream section: maps frequent weight values to
// small indices, plus the parameters needed to emit the palette header.
typedef struct palette {
    int16_t lut[32];     // palette entries, stored as (magnitude<<1)|sign
    int16_t inv_lut[512]; // weight value (+256 offset) -> coded weight index
    int palsize;          // number of palette entries
    int palbits;          // bit width of palette entries
    int use_zero_runs;    // zeros are coded separately
    int only_palette;     // no values outside the palette
    int direct_offset;    // added to the decoded weight index before direct conversion to sign/mag
    int only_zeros;       // special case that the section is all zeros
} palette_t;
| 49 | |
// Non-zero iff x is a power of two. Note: x==0 also reports true, matching
// the original behavior (callers only pass non-negative counts).
static int is_power_of_two( int x ) {
    return (x & (x - 1)) == 0;
}
| 53 | |
// Number of index bits needed to address a palette of 'size' entries:
// 0 for size <= 1, otherwise ceil(log2(size)), capped at 8.
static int get_palette_index_bits( int size ) {
    int bits = 0;
    while (bits < 8 && size > (1 << bits)) {
        bits++;
    }
    return bits;
}
| 61 | |
| 62 | // Search the stream for suitable palette restart positions |
| 63 | // Return the number of restarts |
// Search the stream for suitable palette restart positions.
// On return, *palette_restart_positions points to a malloc'd array of start
// indices (position 0 is always included); the caller owns and frees it.
// Returns the number of restarts (sections), or 0 with a NULL array on
// allocation failure (the stream is then treated as having no sections).
static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
    int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
    int prev_idx[512]; // For each value, keep track of the index of the previous occurence
    int *restart_pos;
    // +1 guarantees room for the mandatory restart at position 0 even when
    // size < 64 (plain size/64 could yield a zero-sized allocation followed
    // by an out-of-bounds write to restart_pos[0]).
    int max_palettes = size/64 + 1;

    // Preliminary allocation of sufficient size
    restart_pos = malloc( (size_t)max_palettes*sizeof(int) );
    if (restart_pos == NULL) {
        *palette_restart_positions = NULL;
        return 0; // out of memory: report no sections
    }
    last_restart_idx=0;
    got_palette=0;
    restart_i=1;
    restart_pos[0] = 0;
    zero_cnt=0;
    memset( prev_idx, -1, sizeof(prev_idx));
    for(i=0; i<size; i++) {
        // Guess if zeros should be excluded from the palette
        int exclude_zero = zero_cnt > (i-last_restart_idx)/4;

        if (got_palette) {
            // Check if the next value is not covered by the current palette
            if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
                // New value: increase the palette size
                palette_size++;
                DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
                if ( is_power_of_two(palette_size-1-exclude_zero) ) {
                    if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
                        // create a new palette because we extend a long lasting palette to require one more index bit
                        DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
                        assert( restart_i < max_palettes );
                        DPRINTF("restart %d pos %d\n", restart_i, i);
                        restart_pos[restart_i++] = i;
                        last_restart_idx = i;
                        got_palette=0;
                        zero_cnt=0;
                    }
                }
            }
        }

        prev_idx[ buf[i]+256 ] = i;
        if (buf[i]==0)
            zero_cnt++;

        static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
        int k;
        // loop over window sizes
        for(k=0; k<5; k++) {
            // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
            int N = window_sizes[k][0] * (got_palette?2:1);
            if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
                // Search backward to the position N nonzero values earlier
                int nzcnt=0;
                for( j=i; j>last_restart_idx; j--) {
                    if ( buf[j]!=0 ) {
                        if (nzcnt==N+1)
                            break;
                        nzcnt++;
                    }
                }
                int restart_idx = j;

                // Calculate the size of a new palette (starting at restart_idx)
                int new_palette_size=0;
                for(j=0; j<512; j++) {
                    if ( prev_idx[j] >= restart_idx ) {
                        new_palette_size++;
                    }
                }

                int create_new_palette=0;
                if (got_palette) {
                    int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
                    int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
                    int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
                    if ( savings>0 ) {
                        // Create new palette because it can be smaller than the existing palette
                        create_new_palette=1;
                        DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
                    }
                } else {
                    if ( (new_palette_size-exclude_zero) <= 32) {
                        int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
                        // estimate if we will make savings by using palette mode
                        int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
                        create_new_palette = savings>0;
                    }
                }
                if (create_new_palette) {
                    palette_size=new_palette_size;
                    got_palette=1;
                    last_restart_idx = restart_idx;
                    DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
                    if ( restart_pos[restart_i-1] != last_restart_idx) {
                        assert( restart_i < max_palettes );
                        restart_pos[restart_i++] = last_restart_idx;
                    }
                    zero_cnt=0;
                    for( j=last_restart_idx; j<=i; j++)
                        if (buf[j]==0)
                            zero_cnt++;
                }
            }
        }
    }
    // Reallocate to actual size; if shrinking fails, keep the original
    // (larger but valid) buffer instead of leaking it
    int *shrunk = realloc( restart_pos, restart_i*sizeof(int) );
    *palette_restart_positions = (shrunk != NULL) ? shrunk : restart_pos;
    return restart_i;
}
| 172 | |
| 173 | // Calculate frequency table |
// Build a 512-bin histogram of the weight stream.
// Bin i counts occurrences of weight value i-256 (valid weights -256..255).
static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
    memset(freq, 0, 512*sizeof(int));
    for (int n = 0; n < size; n++) {
        freq[ buf[n] + 256 ]++;
    }
}
| 181 | |
// qsort comparator producing DESCENDING order of uint64_t values.
static int cmp_uint64(const void * a, const void * b) {
    const uint64_t lhs = *(const uint64_t*)a;
    const uint64_t rhs = *(const uint64_t*)b;
    if (lhs > rhs)
        return -1;
    if (lhs < rhs)
        return 1;
    return 0;
}
| 187 | |
| 188 | // Create palette from the given frequencies |
| 189 | // Freq index 0-511 correspond to weights -256..255 |
// Create palette from the given frequencies
// Freq index 0-511 correspond to weights -256..255
// Fills in all fields of *p: the palette LUT, its size/bit width, the
// direct-mode offset and the only_palette/only_zeros special cases.
// When use_zero_runs is set, zero is excluded from the palette since runs
// of zeros are coded separately.
static void create_palette( int freq[512],
                            int use_zero_runs,
                            palette_t *p ) {
    uint64_t freq64[512];
    int i,all_cnt,all_max_val;

    // Pair the frequency with the value so that
    // the array can be sorted on frequency while keeping
    // track of the corresponding palette value
    memset(freq64, 0, sizeof(freq64));
    all_cnt=0;
    all_max_val=0;
    for(i=-255; i<256; i++) {
        if (i==0 && use_zero_runs)
            continue;
        int sign = i<0;
        int mag = abs(i);
        // palette values use (magnitude<<1)|sign encoding
        int palval = (mag<<1) | sign;

        // Store palette value in 16 LSB bits, which will not affect the sorting
        freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
        all_cnt+=freq[i+256];

        if (freq[i+256]>0) {
            all_max_val = max(all_max_val, palval);
        }
    }

    // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
    // This becomes DIROFS: it is added back by the decoder in direct mode,
    // so those unused low codes need not be representable.
    for(i=0; i<31; i++) {
        if ((freq64[i]>>16)!=0)
            break;
    }
    p->direct_offset = i;

    // Sort in descending frequency order
    qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);

    // Identify special case that there are no weights to code
    // in the weight index stream (i.e. all weights are zeros)
    p->only_zeros = (freq64[0]>>16)==0;
    if (p->only_zeros) {
        p->direct_offset=0;
    }

    // Check if all weights fit into the palette (and the palette is not empty)
    p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;

    int max_palette_size;
    if (p->only_palette) {
        max_palette_size = 32;
    } else {
        // For direct-lut we must make sure that the encoded weight
        // index is not > 511. We do that by limiting the palette size
        // such that the greatest value can be reached after subtracting
        // the palette size.
        max_palette_size = min(32, 511-all_max_val);
        if (max_palette_size==1) {
            max_palette_size=0; // because palette of size 1 is not supported
        }
    }

    // Setup the 32 entry palette, taking the most frequent values first
    int palette_max_val = 0, val, cnt, pal_cnt=0;
    for(i=0; i<max_palette_size; i++) {
        cnt = freq64[i]>>16;
        val = freq64[i]&0xffff;
        if ( cnt==0 )
            break;
        p->lut[i] = val;
        palette_max_val = max(palette_max_val, val);
        pal_cnt+=cnt;
    }
    if (i==1)
        i++; // palette size of 1 is not supported, make it 2

    // Heuristic for when to use the palette. If more than half of the
    // weights are in the palette then we use it. This ensures we don't
    // use palette for e.g. rectangular distributions.
    int palbits_val;
    if (pal_cnt > all_cnt/2) {
        p->palsize = i;
        palbits_val = palette_max_val;
    } else {
        // No palette
        p->palsize = 0;
        // If no palette, then palbits is used to specify the
        // number of bits required for uncompressed mode, i.e.
        // the number of bits for the greatest weight value
        palbits_val = all_max_val;
    }

    // the palette entry bit width
    // minimum 2bits (because PALBITS is in range 2..9)
    int palbits=2;
    while( (1<<palbits) <= palbits_val )
        palbits++;
    assert(palbits<=9);
    p->palbits = palbits;
    p->use_zero_runs = use_zero_runs;
}
| 291 | |
| 292 | // Return 1 if zero runs should be used |
| 293 | // If palette_size is 512, then palette is not used (in that case the palette is setup |
| 294 | // with the standard alternating unsigned to signed mapping) |
| 295 | static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) { |
| 296 | int freq[512], i; |
| 297 | |
| 298 | // Calculate frequencies of the given weight stream |
| 299 | calc_freq( inbuf, inbuf_size, freq); |
| 300 | |
| 301 | // Find two most common values |
| 302 | int most_common_freq[2]={0}, most_common_val[2]={0}; |
| 303 | for(i=0; i<512; i++) { |
| 304 | if ( freq[i] > most_common_freq[0] ) { |
| 305 | most_common_freq[1] = most_common_freq[0]; |
| 306 | most_common_val[1] = most_common_val[0]; |
| 307 | most_common_freq[0] = freq[i]; |
| 308 | most_common_val[0] = i-256; |
| 309 | } else if ( freq[i] > most_common_freq[1] ) { |
| 310 | most_common_freq[1] = freq[i]; |
| 311 | most_common_val[1] = i-256; |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | // Decide if zero-runs (alternating mode) should be used: |
| 316 | // * zero should be the most common symbol |
| 317 | // * zero should be sufficiently more common than the second most common symbol |
| 318 | int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1]; |
| 319 | |
| 320 | // Create the palette |
| 321 | create_palette( freq, use_zero_runs, p); |
| 322 | |
| 323 | return use_zero_runs; |
| 324 | } |
| 325 | |
| 326 | static void create_inverse_palette( palette_t *p) { |
| 327 | int i; |
| 328 | memset( p->inv_lut, 0, sizeof(p->inv_lut)); |
| 329 | for(i=0; i<512; i++) { |
| 330 | int val = i; |
| 331 | int sign = val&1; |
| 332 | int mag = val>>1; |
| 333 | int weight = sign ? -mag : mag; |
| 334 | if (weight+256 < 512) |
| 335 | p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset; |
| 336 | } |
| 337 | for(i=0; i<p->palsize; i++) { |
| 338 | int val = p->lut[i]; |
| 339 | int sign = val&1; |
| 340 | int mag = val>>1; |
| 341 | int weight = sign ? -mag : mag; |
| 342 | if (weight+256 < 512) |
| 343 | p->inv_lut[ weight+256 ] = i; |
| 344 | } |
| 345 | } |
| 346 | |
#define NWCFG 13  // number of weight GRC configurations (see w_grc_params)
#define NZCFG 4 // restrict search to ZDIV=0..3
#define MAX_ZWCFG (max(NWCFG,NZCFG))

// search state: one node per (GRC config, stream position) in the
// Viterbi-style search over GRC parameter sequences
typedef struct search_state {
    int bitcnt;        // number of bits to reach this state
    uint8_t prev_cfg;  // previous grc parameter config
} search_state_t;

// (trunc<<4) | div, 0x20 means uncompressed
static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
| 360 | |
| 361 | |
| 362 | |
// An algorithm similar to the Viterbi algorithm is used to search for a
// good GRC parameter sequence for the given input value sequence.
// The inval buffer can contain weights, weight indices or runs.
// The return value is the resulting number of bitstream sections.
// Outputs: grc_param_cfg[k]/grc_param_pos[k] give the config and end
// position of section k; *bitcnt receives the total estimated bit count.
// existing_grc_param_pos lists positions where the other stream already
// switches config, so switching here is free (cmd_cost 0).
static int search_grc_params( const int *inval_buf,
                              int n_inval,
                              int zrun_mode,
                              int uncompressed_bits,
                              uint8_t *grc_param_cfg,
                              int *grc_param_pos,
                              int max_grc_param_cfg,
                              int *existing_grc_param_pos,
                              int n_existing_grc_param_pos,
                              int *bitcnt )
{
    int n_cfg = zrun_mode ? NZCFG : NWCFG;
    const char *grc_params = zrun_mode ? z_grc_params : w_grc_params;
    int i,j;

    // One DP trellis row per GRC config; column i holds the best-known cost
    // of coding the first i values ending in that config.
    // NOTE(review): malloc results are not checked; an OOM here dereferences NULL.
    search_state_t *state[MAX_ZWCFG];
    for(i=0; i<n_cfg; i++) {
        state[i] = malloc( sizeof(search_state_t) * (n_inval+1) );
        state[i][0].bitcnt=0;
        state[i][0].prev_cfg=i;
    }

    // Loop over inval_buf
    int existing_idx=0;
    for(i=0; i<n_inval; i++) {
        int value = inval_buf[i];

        // Best GRC parameter so far
        int best_bitcnt=0x7fffffff, best_cfg=0;
        for(j=0; j<n_cfg; j++) {
            if (state[j][i].bitcnt < best_bitcnt) {
                best_bitcnt = state[j][i].bitcnt;
                best_cfg = j;
            }
        }

        int cmd_cost = 40;
        if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
            // free transition, because the weight stream already inserted a command at this position
            cmd_cost = 0;
            existing_idx++;
        }

        // Loop over GRC parameters, calculate bits to code value, and then update the search state
        for(j=0; j<n_cfg; j++) {
            int div = grc_params[j]&15;
            int trunc = grc_params[j]>>4;
            int q = value>>div;
            // Golomb-Rice cost: quotient in unary (truncated to 2 bits when
            // trunc is set) plus 'div' remainder bits
            int bits = trunc ? min(q+1,2) + div : q+1+div;
            if (!zrun_mode && ((trunc && q>2) || q>31))
                bits=10000; // it's not possible to code the current value; give it a high cost
            if (trunc==2)
                bits=uncompressed_bits;  // trunc==2 means uncompressed mode

            if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
                // Change GRC parameters
                state[j][i+1].prev_cfg = best_cfg;
                state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits;
            } else {
                // Keep same GRC parameters
                state[j][i+1].prev_cfg = j;
                state[j][i+1].bitcnt = state[j][i].bitcnt + bits;
            }
        }
    }


    // Best GRC parameter at the end of the stream
    int best_bitcnt=0x7fffffff, best_cfg=0;
    for(j=0; j<n_cfg; j++) {
        if (state[j][n_inval].bitcnt < best_bitcnt) {
            best_bitcnt = state[j][n_inval].bitcnt;
            best_cfg = j;
        }
    }

    // First backward pass: count the number of config-change commands
    int cfg = best_cfg;
    int n_cmds=0;
    for(i=n_inval; i>=0; i--) {
        if (state[cfg][i].prev_cfg != cfg || i==0) {
            n_cmds++;
            cfg = state[cfg][i].prev_cfg;
        }
    }

    // silence unused warning when asserts are compiled out
    (void)(max_grc_param_cfg);
    assert(n_cmds<=max_grc_param_cfg);

    // Second backward pass: fill the output arrays front-to-back
    cfg = best_cfg;
    j=n_cmds-1;
    int endpos=n_inval;
    for(i=n_inval; i>=0; i--) {
        if (state[cfg][i].prev_cfg != cfg || i==0) {
            grc_param_cfg[j] = cfg;
            grc_param_pos[j] = endpos;
            j--;
            cfg = state[cfg][i].prev_cfg;
            endpos = i-1;
        }
    }
    assert(j==-1);

    for(i=0; i<n_cfg; i++) {
        free(state[i]);
    }

    *bitcnt = best_bitcnt;

    return n_cmds;
}
| 477 | |
| 478 | |
/////////////////////////////// Write to bitstream

// LSB-first bit writer over a caller-provided byte buffer.
typedef struct bitbuf {
    uint8_t *buf;
    int buf_size;     // in bytes
    int pos;          // bit pos of next bit
    int log_symbols;  // non-zero: printf each symbol as it is written
} bitbuf_t;

// size in bytes
static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
    bb->buf = buf;
    bb->buf_size = size;
    bb->pos = 0;
    bb->log_symbols = log_symbols;
}

// Write a single bit at the current position (bits fill bytes LSB-first).
static void bitbuf_putbit( bitbuf_t *bb, int bit) {
    int byte_idx = bb->pos >> 3;
    int shift = bb->pos & 7;
    assert( byte_idx >= 0 );
    assert( byte_idx < bb->buf_size );
    int cleared = bb->buf[byte_idx] & ~(1 << shift);
    bb->buf[byte_idx] = (uint8_t)(cleared | (bit << shift));
    bb->pos++;
}

// Write the 'len' least significant bits of 'data', LSB first.
// 'name' is only used for the optional symbol log.
static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
    if (len <= 0)
        return;
    if (bb->log_symbols)
        printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
    for (int i = 0; i < len; i++)
        bitbuf_putbit(bb, (data >> i) & 1);
}
| 515 | |
| 516 | // Return new bitpos |
// Encode one bitstream slice: a header (optionally carrying a new palette)
// followed by interleaved weight and zero-run chunks, GRC-coded with the
// given configs. Weights and zero-runs are kept roughly in step via a
// balance counter so the decoder can consume them alternately.
// Return new bitpos
static int encode_slice( const int *w_value,        // weight codes for this slice
                         const int *z_value,        // zero-run lengths (unused if !use_zero_runs)
                         int nvalues,               // number of weights in the slice
                         palette_t *p,
                         int new_palette,           // non-zero: emit palette fields in header
                         int uncompressed_bits,     // bit width for uncompressed mode
                         int w_cfg,                 // index into w_grc_params
                         int z_cfg,                 // index into z_grc_params
                         uint8_t *bitbuf,
                         int bitbuf_size,
                         int bitpos,                // starting bit position
                         int verbose )
{
    int i,j;
    bitbuf_t bitbuf_s, *bb=&bitbuf_s;
    bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
    bb->pos = bitpos;

    assert(nvalues<32768);  // SLICELEN is a 15-bit field
    // GRC parameters for this slice
    int w_grc_div = w_grc_params[w_cfg] & 15;
    int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1;
    int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2;
    int z_grc_div = z_grc_params[z_cfg] & 15;

    if (w_uncompressed) {
        // In uncompressed mode all bits go into the "remainder" field
        w_grc_div = uncompressed_bits;
    }

    int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
    int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;

    if (verbose&1) {
        printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
            bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
    }

    // Write slice header
    bitbuf_put( bb, "ZDIV", 3, zdiv);
    bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
    bitbuf_put( bb, "WDIV", 3, wdiv);
    bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
    bitbuf_put( bb, "NEWPAL", 1, new_palette );
    if (new_palette) {
        bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
        bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
        bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
        for(i=0; i<p->palsize; i++) {
            bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
        }
    }

    // The first slice of a section carries one extra zero-run (the leading one)
    int z_nvalues = nvalues + (new_palette?1:0);
    int w_pos=0, z_pos=0;
    // w_q/z_q: pending GRC quotient still to be emitted in unary (-1: none)
    int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
    int z_unary=0, z_q=-1, z_r=0;
    int w_nsymbols=0, w_remain[12]={0};
    int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
    int z_nsymbols=0, z_remain[12]={0};
    int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
    int z_unary_len = z_grc_div<3 ? 12 : 8;
    do {
        // Keep weight and zero-run streams within 8 positions of each other
        int balance = p->use_zero_runs ? w_pos - z_pos : 0;
        int w_enable = balance<8 && w_pos<nvalues;
        int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
        if (w_enable) {
            // Encode chunk (weights)
            j=0;
            w_nsymbols=0;
            w_unary0=0;
            w_unary1=0;
            w_unary1_len=0;
            int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
            while(j<max_symbols) {
                if (w_q<0) {
                    // Fetch next weight and split into GRC quotient/remainder
                    if (w_pos<nvalues) {
                        int value = w_value[w_pos];
                        assert(value<512);
                        w_q = value>>w_grc_div;
                        w_r = value&((1<<w_grc_div)-1);
                        assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
                    } else {
                        w_q = 0;
                        w_r = -1; // don't send remainder
                    }
                }
                // Emit quotient bits: unary0 holds the first unary level,
                // unary1 the second (two quotient units per slot, or three
                // when truncated coding is active)
                while( w_q>=0 && j<max_symbols) {
                    w_unary0 |= w_q>0 ? (1<<j) : 0;
                    if (w_q>0) {
                        w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
                        w_unary1_len++;
                    }
                    j++;
                    w_q-=2;
                    if (w_grc_trunc)
                        w_q--;
                }
                if (w_q<0 && w_r>=0) {
                    // Quotient fully emitted: queue the remainder for the NEXT chunk
                    w_remain[w_nsymbols] = w_r;
                    w_nsymbols++;
                    w_pos++;
                }
            }
        }

        if (z_enable) {
            // Encode chunk (zrun)
            j=0;
            z_nsymbols=0;
            z_unary=0;
            while(j<z_unary_len) {
                if (z_q<0) {
                    if (z_pos<z_nvalues) {
                        int value = z_value[z_pos];
                        z_q = value>>z_grc_div;
                        z_r = value&((1<<z_grc_div)-1);
                    } else {
                        z_q = 0;
                        z_r = -1;
                    }
                }
                while( z_q>=0 && j<z_unary_len) {
                    z_unary |= z_q>0 ? (1<<j) : 0;
                    j++;
                    z_q--;
                }
                if (z_q<0 && z_r>=0) {
                    z_remain[z_nsymbols] = z_r;
                    z_nsymbols++;
                    z_pos++;
                }
            }
        }

        // Write chunk to bitstream. Remainders are delayed one chunk
        // (the *_prev_* state), interleaving with the next chunk's unary parts.
        if (w_enable && !w_uncompressed) {
            bitbuf_put( bb, "WUNARY0", 12, w_unary0);
        }
        if (z_enable) {
            bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
        }
        if (w_enable && !w_uncompressed) {
            bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
        }
        if (w_prev_enable) {
            for(i=0; i<w_prev_nsymbols; i++) {
                bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
            }
        }
        if (z_prev_enable) {
            for(i=0; i<z_prev_nsymbols; i++) {
                bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
            }
        }
        w_prev_enable = w_enable;
        w_prev_nsymbols = w_nsymbols;
        memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
        z_prev_enable = z_enable;
        z_prev_nsymbols = z_nsymbols;
        memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
    } while( w_prev_enable || z_prev_enable );

    return bb->pos;
}
| 681 | |
| 682 | |
| 683 | // return new bitpos |
// Encode one palette section of the weight stream: split the input into
// weight codes and zero-runs, search for good per-slice GRC parameters for
// both streams, then emit the slices. The first slice carries the palette.
// return new bitpos
static int encode_section( const int16_t *inbuf,
                           int size,
                           palette_t *p,
                           uint8_t *bitbuf,
                           int bitbuf_size,
                           int bitpos,
                           int verbose )
{
    int uncompressed_bits;

    // Uncompressed mode can only be used if either all weights
    // are in the palette OR if the palette is not used.
    if (p->only_palette) {
        // Uncompressed bits derived from palette size
        uncompressed_bits=0;
        while( (1<<uncompressed_bits) < p->palsize )
            uncompressed_bits++;
    } else if (p->palsize==0) {
        // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
        uncompressed_bits = p->palbits;
    } else {
        // Don't use uncompressed
        uncompressed_bits = 100;
    }

    // NOTE(review): these mallocs (and the ones below) are unchecked; an
    // OOM here would dereference NULL.
    int *weight_values = malloc( size*sizeof(int) );
    int *zrun_values = malloc( size*sizeof(int) );

    // Get weights (or weight indicies) AND zero-runs from the input weight stream.
    int i=0, n_weights = 0, zcnt;
    while(1) {
        if (p->use_zero_runs) {
            zcnt=0;
            // Count zero run
            // Special case: if all weights in the section are zero, we must
            // still ensure we have one coded weight so the the slice length
            // doesn't become 0. Therefore we skip the first zero run and code
            // the zero explicitly as a weight value instead
            if (!p->only_zeros || i>0) {
                while( i<size && inbuf[i]==0) {
                    zcnt++;
                    i++;
                }
            }
            zrun_values[n_weights] = zcnt;
        }
        if (i==size)
            break;
        // Map the weight value to its coded index through the inverse palette
        int value = p->inv_lut[inbuf[i]+256];
        weight_values[n_weights] = value;
        n_weights++;
        i++;
    }

    // Search for good GRC parameters for the weight stream
    int n_w_slice, w_bitcnt;
    uint8_t *w_slice_cfg;
    int *w_slice_pos;
    w_slice_cfg = malloc( size );
    w_slice_pos = malloc( size*sizeof(int) );
    n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
    if (n_weights==0)
        n_w_slice = 0;

    // Search for good GRC parameters for the zrun stream
    // (the weight slice boundaries are passed in so that switching configs
    // at an existing boundary costs nothing)
    int n_z_slice=0, z_bitcnt=0;
    uint8_t *z_slice_cfg=0;
    int *z_slice_pos=0;
    if (p->use_zero_runs) {
        z_slice_cfg = malloc( size );
        z_slice_pos = malloc( size*sizeof(int) );
        n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
    }

    // Encode bitstream slice: cut at whichever comes first of the next
    // weight-config change, the next zrun-config change, the max slice
    // length, or the end of the section
    int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
    while(pos<n_weights || new_palette) {
        int endpos=pos+32767; // max slice length

        if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
            endpos = w_slice_pos[i_w_slice];
        }

        if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
            endpos = z_slice_pos[i_z_slice];
        }

        if (n_weights < endpos) {
            endpos = n_weights;
        }

        // The first slice (when new_palette is 1) encodes zero runs both at the
        // beginning and end (i.e. number of zero runs are len+1).
        // The following slices only encode zero runs at the end (there cannot be
        // any zeros in the beginning since they are encoded by the previous slice)
        int len = endpos - pos;
        int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
        bitpos = encode_slice( weight_values+pos, zrun_buf, len,
                               p, new_palette, uncompressed_bits,
                               w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
                               bitbuf, bitbuf_size, bitpos, verbose );
        new_palette = 0;

        if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
            i_w_slice++;
        }
        if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
            i_z_slice++;
        }
        pos = endpos;
    }

    // Free temporary buffers
    free(w_slice_cfg);
    free(w_slice_pos);
    if (p->use_zero_runs) {
        free(z_slice_cfg);
        free(z_slice_pos);
    }
    free(weight_values);
    free(zrun_values);

    return bitpos;
}
| 808 | |
| 809 | // Encode the given weight stream |
| 810 | // inbuf uncompressed 9bit signed weights |
| 811 | // inbuf_size number of weights |
| 812 | // outbuf compressed bitstream, buffer is malloced |
| 813 | // verbose if non-zero, printf log |
| 814 | // Return value is the size in bytes of the compressed output |
| 815 | // Return -1 if error |
| 816 | int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) { |
| 817 | int i; |
| 818 | // Range check |
| 819 | for(i=0; i<inbuf_size; i++) { |
| 820 | if (inbuf[i]<-255 || inbuf[i]>255) { |
| 821 | printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]); |
| 822 | return -1; |
| 823 | } |
| 824 | } |
| 825 | |
| 826 | int bitbuf_size = inbuf_size*2+1024; |
| 827 | *outbuf = malloc( bitbuf_size ); |
| 828 | |
| 829 | // Analyse input data to find palette re-programming points |
| 830 | int n_restarts; |
| 831 | int *palette_restart_pos; |
| 832 | n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos); |
| 833 | |
| 834 | // Compress each section (using a single palette) separately |
| 835 | int bitpos=0; |
| 836 | for(i=0; i<n_restarts; i++) { |
| 837 | palette_t palette; |
| 838 | int pos, size; |
| 839 | pos = palette_restart_pos[i]; |
| 840 | size = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos; |
| 841 | find_palette( inbuf+pos, size, &palette); |
| 842 | create_inverse_palette( &palette); |
| 843 | bitpos = encode_section( inbuf+pos, size, &palette, |
| 844 | *outbuf, bitbuf_size, bitpos, verbose ); |
| 845 | } |
| 846 | |
| 847 | |
| 848 | // Add end of stream marker and align to 128bit |
| 849 | { |
| 850 | bitbuf_t bitbuf_s, *bb=&bitbuf_s; |
| 851 | bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 ); |
| 852 | bb->pos = bitpos; |
| 853 | bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS); |
| 854 | bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff ); |
| 855 | |
| 856 | // Pad with 0xff until 64bit aligned |
| 857 | while( bb->pos & 127 ) { |
| 858 | bitbuf_put( bb, "PAD", 8, 0xff ); |
| 859 | } |
| 860 | bitpos = bb->pos; |
| 861 | } |
| 862 | assert((bitpos&127)==0); |
| 863 | int outbuf_size = bitpos/8; |
| 864 | *outbuf = realloc( *outbuf, outbuf_size); |
| 865 | |
| 866 | free(palette_restart_pos); |
| 867 | |
| 868 | return outbuf_size; |
| 869 | } |
| 870 | |
// Free a buffer allocated by mlw_encode. Safe to call with NULL.
void mlw_free_outbuf( uint8_t *outbuf ) {
    // free(NULL) is a defined no-op, so no guard is needed
    free(outbuf);
}