blob: 3ec24908c7ec2a83576e10739700e3cbdab5af7e [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001/*
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +01002 * SPDX-FileCopyrightText: Copyright 2020-2022, 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01003 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#include <stdio.h>
20#include <stdlib.h>
21#include <stdint.h>
22#include <stdbool.h>
23#include <string.h>
24#include <assert.h>
25#include <math.h>
26#include <stdarg.h>
27#include <math.h>
28#include "mlw_common.h"
29#include "mlw_encode.h"
30
// Debug trace logging; swap the two definitions to enable verbose output.
#define DPRINTF(...)
//#define DPRINTF(...) printf(__VA_ARGS__)

// Zero must be at least this many times more frequent than the second most
// common weight value before zero-run (alternating) coding is enabled.
#define ZERO_RUN_THRES 4

#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
#ifndef max
#define max(a,b) ((a)>(b)?(a):(b))
#endif

// Allocate 'size' bytes into 'var'; on failure break out of the enclosing
// loop (callers wrap their bodies in do { ... } while(false) for cleanup).
#define CHECKED_MALLOC(var, size) { if ( !(var = malloc(size)) ) break; }
44
// Palette used to encode one section of the weight stream.
typedef struct palette {
    int16_t lut[32];     // palette entries, stored as (magnitude<<1)|sign
    int16_t inv_lut[512]; // maps weight value (offset by +256) to encoded index
    int palsize;         // number of palette entries
    int palbits;         // bit width of palette entries
    int use_zero_runs;   // zeros are coded separately
    int only_palette;    // no values outside the palette
    int direct_offset;   // added to the decoded weight index before direct conversion to sign/mag
    int only_zeros;      // special case that the section is all zeros
} palette_t;
55
// Non-zero when x is zero or an exact power of two.
static int is_power_of_two( int x ) {
    return (x & (x-1)) == 0;
}
59
// Integer division of num by den with the quotient rounded upwards.
static int round_up_divide(int num, int den)
{
    const int biased = num + den - 1;
    return biased / den;
}
64
// Smallest multiple of den that is greater than or equal to num.
static int round_up(int num, int den)
{
    return ((num + den - 1) / den) * den;
}
69
// Number of index bits needed to address 'size' palette entries, capped at 8.
static int get_palette_index_bits( int size ) {
    int bits = 0;
    while (bits < 8 && size > (1 << bits)) {
        bits++;
    }
    return bits;
}
77
// Search the stream for suitable palette restart positions.
// Return the number of restarts, or -1 if memory allocation failed.
// On success *palette_restart_positions receives a malloc'd array of the
// section start indices (ownership passes to the caller); on failure it
// stays NULL and all internal allocations are released.
static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
    int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
    int prev_idx[512]; // For each value, keep track of the index of the previous occurence
    int *restart_pos;
    // restart_pos[0] is always written below, so never allocate fewer than
    // one entry (round_up_divide(0, 64) would request 0 bytes for size==0).
    int max_palettes = max(1, round_up_divide(size, 64));
    *palette_restart_positions = NULL;

    // Preliminary allocation of sufficient size
    restart_pos = (int*)malloc( max_palettes*sizeof(int) );
    if (!restart_pos) {
        return -1;
    }
    last_restart_idx=0;
    got_palette=0;
    restart_i=1;
    restart_pos[0] = 0;
    zero_cnt=0;
    memset(prev_idx, -1, sizeof(prev_idx));
    for(i=0; i<size; i++) {
        // Guess if zeros should be excluded from the palette
        int exclude_zero = zero_cnt > (i-last_restart_idx)/4;

        if (got_palette) {
            // Check if the next value is not covered by the current palette
            if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
                // New value: increase the palette size
                palette_size++;
                DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
                if ( is_power_of_two(palette_size-1-exclude_zero) ) {
                    if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
                        // create a new palette because we extend a long lasting palette to require one more index bit
                        DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
                        if (restart_i == max_palettes) {
                            max_palettes = max_palettes*2;
                            // Grow through a temporary so the original buffer
                            // is freed (not leaked) if realloc fails.
                            int *grown = (int*)realloc( restart_pos, max_palettes*sizeof(int) );
                            if (!grown) {
                                free(restart_pos);
                                return -1;
                            }
                            restart_pos = grown;
                        }
                        DPRINTF("restart %d pos %d\n", restart_i, i);
                        restart_pos[restart_i++] = i;
                        last_restart_idx = i;
                        got_palette=0;
                        zero_cnt=0;
                    }
                }
            }
        }

        prev_idx[ buf[i]+256 ] = i;
        if (buf[i]==0)
            zero_cnt++;

        static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
        int k;
        // loop over window sizes
        for(k=0; k<5; k++) {
            // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
            int N = window_sizes[k][0] * (got_palette?2:1);
            if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
                // Search backward to the position N nonzero values earlier
                int nzcnt=0;
                for( j=i; j>last_restart_idx; j--) {
                    if ( buf[j]!=0 ) {
                        if (nzcnt==N+1)
                            break;
                        nzcnt++;
                    }
                }
                int restart_idx = j;

                // Calculate the size of a new palette (starting at restart_idx)
                int new_palette_size=0;
                for(j=0; j<512; j++) {
                    if ( prev_idx[j] >= restart_idx ) {
                        new_palette_size++;
                    }
                }

                int create_new_palette=0;
                if (got_palette) {
                    int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
                    int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
                    int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
                    if ( savings>0 ) {
                        // Create new palette because it can be smaller than the existing palette
                        create_new_palette=1;
                        DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
                    }
                } else {
                    if ( (new_palette_size-exclude_zero) <= 32) {
                        int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
                        // estimate if we will make savings by using palette mode
                        int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
                        create_new_palette = savings>0;
                    }
                }
                if (create_new_palette) {
                    palette_size=new_palette_size;
                    got_palette=1;
                    last_restart_idx = restart_idx;
                    DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
                    if ( restart_pos[restart_i-1] != last_restart_idx) {
                        if (restart_i == max_palettes) {
                            max_palettes = max_palettes*2;
                            // Same leak-safe growth pattern as above.
                            int *grown = (int*)realloc( restart_pos, max_palettes*sizeof(int) );
                            if (!grown) {
                                free(restart_pos);
                                return -1;
                            }
                            restart_pos = grown;
                        }
                        restart_pos[restart_i++] = last_restart_idx;
                    }
                    zero_cnt=0;
                    for( j=last_restart_idx; j<=i; j++)
                        if (buf[j]==0)
                            zero_cnt++;
                }
            }
        }
    }
    // Reallocate to actual size. Keep ownership of the original buffer if
    // the shrinking realloc fails, so it can be freed instead of leaked.
    int *trimmed = (int*)realloc( restart_pos, restart_i*sizeof(int) );
    if (!trimmed) {
        free(restart_pos);
        return -1;
    }
    *palette_restart_positions = trimmed;
    return restart_i;
}
204
// Build a histogram of the weight stream: freq[w+256] counts occurrences
// of the signed weight value w.
static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
    for (int slot = 0; slot < 512; slot++) {
        freq[slot] = 0;
    }
    for (int n = 0; n < size; n++) {
        freq[256 + buf[n]]++;
    }
}
213
// qsort comparator ordering uint64_t values in descending order.
static int cmp_uint64(const void * a, const void * b) {
    const uint64_t lhs = *(const uint64_t*)a;
    const uint64_t rhs = *(const uint64_t*)b;
    if (lhs > rhs)
        return -1;
    if (lhs < rhs)
        return 1;
    return 0;
}
219
// Create palette from the given frequencies
// Freq index 0-511 correspond to weights -256..255
// When use_zero_runs is set, zero is kept out of the palette because zeros
// are coded as separate run-lengths. Fills in all fields of *p.
static void create_palette( int freq[512],
                            int use_zero_runs,
                            palette_t *p ) {
    uint64_t freq64[512];
    int i,all_cnt,all_max_val;

    // Pair the frequency with the value so that
    // the array can be sorted on frequency while keeping
    // track of the corresponding palette value
    memset(freq64, 0, sizeof(freq64));
    all_cnt=0;
    all_max_val=0;
    for(i=-255; i<256; i++) {
        if (i==0 && use_zero_runs)
            continue;
        // Convert the signed weight to (magnitude<<1)|sign form
        int sign = i<0;
        int mag = abs(i);
        int palval = (mag<<1) | sign;

        // Store palette value in 16 LSB bits, which will not affect the sorting
        freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
        all_cnt+=freq[i+256];

        if (freq[i+256]>0) {
            all_max_val = max(all_max_val, palval);
        }
    }

    // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
    for(i=0; i<31; i++) {
        if ((freq64[i]>>16)!=0)
            break;
    }
    p->direct_offset = i;

    // Sort in descending frequency order
    qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);

    // Identify special case that there are no weights to code
    // in the weight index stream (i.e. all weights are zeros)
    p->only_zeros = (freq64[0]>>16)==0;
    if (p->only_zeros) {
        p->direct_offset=0;
    }

    // Check if all weights fit into the palette (and the palette is not empty)
    p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;

    int max_palette_size;
    if (p->only_palette) {
        max_palette_size = 32;
    } else {
        // For direct-lut we must make sure that the encoded weight
        // index is not > 511. We do that by limiting the palette size
        // such that the greatest value can be reached after subtracting
        // the palette size.
        max_palette_size = min(32, 511-all_max_val);
        if (max_palette_size==1) {
            max_palette_size=0; // because palette of size 1 is not supported
        }
    }

    // Setup the 32 entry palette from the most frequent values
    int16_t palette_max_val = 0, val;
    int cnt, pal_cnt=0;
    for(i=0; i<max_palette_size; i++) {
        cnt = (int)(freq64[i]>>16);
        val = freq64[i]&0xffff;
        if ( cnt==0 )
            break;
        p->lut[i] = val;
        palette_max_val = max(palette_max_val, val);
        pal_cnt+=cnt;
    }
    if (i==1)
        p->lut[i++] = 0; // palette size of 1 is not supported, make it 2

    // Heuristic for when to use the palette. If more than half of the
    // weights are in the palette then we use it. This ensures we don't
    // use palette for e.g. rectangular distributions.
    int palbits_val;
    if (pal_cnt > all_cnt/2) {
        p->palsize = i;
        palbits_val = palette_max_val;
    } else {
        // No palette
        p->palsize = 0;
        // If no palette, then palbits is used to specify the
        // number of bits required for uncompressed mode, i.e.
        // the number of bits for the greatest weight value
        palbits_val = all_max_val;
    }

    // the palette entry bit width
    // minimum 2bits (because PALBITS is in range 2..9)
    int palbits=2;
    while( (1<<palbits) <= palbits_val )
        palbits++;
    assert(palbits<=9);
    p->palbits = palbits;
    p->use_zero_runs = use_zero_runs;
}
324
325// Return 1 if zero runs should be used
326// If palette_size is 512, then palette is not used (in that case the palette is setup
327// with the standard alternating unsigned to signed mapping)
328static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) {
329 int freq[512], i;
330
331 // Calculate frequencies of the given weight stream
332 calc_freq( inbuf, inbuf_size, freq);
333
334 // Find two most common values
335 int most_common_freq[2]={0}, most_common_val[2]={0};
336 for(i=0; i<512; i++) {
337 if ( freq[i] > most_common_freq[0] ) {
338 most_common_freq[1] = most_common_freq[0];
339 most_common_val[1] = most_common_val[0];
340 most_common_freq[0] = freq[i];
341 most_common_val[0] = i-256;
342 } else if ( freq[i] > most_common_freq[1] ) {
343 most_common_freq[1] = freq[i];
344 most_common_val[1] = i-256;
345 }
346 }
347
348 // Decide if zero-runs (alternating mode) should be used:
349 // * zero should be the most common symbol
350 // * zero should be sufficiently more common than the second most common symbol
351 int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1];
352
353 // Create the palette
354 create_palette( freq, use_zero_runs, p);
355
356 return use_zero_runs;
357}
358
359static void create_inverse_palette( palette_t *p) {
360 int i;
361 memset( p->inv_lut, 0, sizeof(p->inv_lut));
362 for(i=0; i<512; i++) {
363 int val = i;
364 int sign = val&1;
365 int mag = val>>1;
366 int weight = sign ? -mag : mag;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100367 int index = weight+256;
368 if (index >= 0 && index < 512)
369 p->inv_lut[ index ] = i + p->palsize - p->direct_offset;
Tim Hall79d07d22020-04-27 18:20:16 +0100370 }
371 for(i=0; i<p->palsize; i++) {
372 int val = p->lut[i];
373 int sign = val&1;
374 int mag = val>>1;
375 int weight = sign ? -mag : mag;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100376 int index = weight+256;
377 assert(index >= 0 && index < 512);
378 if (index >= 0 && index < 512)
379 p->inv_lut[ index ] = i;
Tim Hall79d07d22020-04-27 18:20:16 +0100380 }
381}
382
// Number of candidate GRC configurations for weights and zero-runs
// (the lengths of w_grc_params and the searched prefix of z_grc_params).
#define NWCFG 13
#define NZCFG 4 // restrict search to ZDIV=0..3
#define MAX_ZWCFG (max(NWCFG,NZCFG))

// search state (one per input position per candidate config)
typedef struct search_state {
    int bitcnt;         // number of bits to reach this state
    uint8_t prev_cfg;   // previous grc parameter config
} search_state_t;

// (trunc<<4) | div, 0x20 means uncompressed
static const uint8_t w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
static const uint8_t z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
Tim Hall79d07d22020-04-27 18:20:16 +0100396
397
398
// An algorithm similar to the Viterbi algorithm is used to search for a
// good GRC parameter sequence for the given input value sequence.
// The inval buffer can contain weights, weight indices or runs.
// The return value is the resulting number of bitstream sections,
// or -1 if memory allocation failed.
//   inval_buf / n_inval       - values to code
//   zrun_mode                 - non-zero when searching zero-run (ZDIV) configs
//   uncompressed_bits         - bit cost per symbol in uncompressed mode
//   grc_param_cfg/pos         - out: chosen config and its end position, per section
//   max_grc_param_cfg         - capacity of the output arrays (assert-checked only)
//   existing_grc_param_pos[n] - positions where the other stream already placed
//                               commands; switching there costs nothing
//   bitcnt                    - out: total bit count of the best sequence
static int search_grc_params( const int *inval_buf,
                              int n_inval,
                              int zrun_mode,
                              int uncompressed_bits,
                              uint8_t *grc_param_cfg,
                              int *grc_param_pos,
                              int max_grc_param_cfg,
                              int *existing_grc_param_pos,
                              int n_existing_grc_param_pos,
                              int *bitcnt )
{
    int n_cfg = zrun_mode ? NZCFG : NWCFG;
    const uint8_t *grc_params = zrun_mode ? z_grc_params : w_grc_params;
    int i,j;

    // DP state: state[j][i] is the cheapest way to code the first i values
    // while ending in config j, plus the config used at step i-1.
    search_state_t *state[MAX_ZWCFG];
    for(i=0; i<n_cfg; i++) {
        CHECKED_MALLOC(state[i], sizeof(search_state_t) * (n_inval + 1));
        state[i][0].bitcnt=0;
        state[i][0].prev_cfg=i;
    }

    if ( i < n_cfg ) { // Memory allocation failed - clean up and exit
        while ( i ) {
            free(state[--i]);
        }
        return -1;
    }

    // Loop over inval_buf
    int existing_idx=0;
    for(i=0; i<n_inval; i++) {
        int value = inval_buf[i];

        // Best GRC parameter so far
        int best_bitcnt=0x7fffffff, best_cfg=0;
        for(j=0; j<n_cfg; j++) {
            if (state[j][i].bitcnt < best_bitcnt) {
                best_bitcnt = state[j][i].bitcnt;
                best_cfg = j;
            }
        }

        // Cost (in bits) charged for switching configuration here
        int cmd_cost = 40;
        if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
            // free transition, because the weight stream already inserted a command at this position
            cmd_cost = 0;
            existing_idx++;
        }

        // Loop over GRC parameters, calculate bits to code value, and then update the search state
        for(j=0; j<n_cfg; j++) {
            int div = grc_params[j]&15;
            int trunc = grc_params[j]>>4;
            int q = value>>div;
            int bits = trunc ? min(q+1,2) + div : q+1+div;
            if (!zrun_mode && ((trunc && q>2) || q>31))
                bits=10000; // it's not possible to code the current value; give it a high cost
            if (trunc==2)
                bits=uncompressed_bits;

            if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
                // Change GRC parameters
                state[j][i+1].prev_cfg = best_cfg;
                state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits;
            } else {
                // Keep same GRC parameters
                state[j][i+1].prev_cfg = j;
                state[j][i+1].bitcnt = state[j][i].bitcnt + bits;
            }
        }
    }


    // Best GRC parameter at the end of the stream
    int best_bitcnt=0x7fffffff, best_cfg=0;
    for(j=0; j<n_cfg; j++) {
        if (state[j][n_inval].bitcnt < best_bitcnt) {
            best_bitcnt = state[j][n_inval].bitcnt;
            best_cfg = j;
        }
    }

    // First backtracking pass: count the number of sections (config changes)
    int cfg = best_cfg;
    int n_cmds=0;
    for(i=n_inval; i>=0; i--) {
        if (state[cfg][i].prev_cfg != cfg || i==0) {
            n_cmds++;
            cfg = state[cfg][i].prev_cfg;
        }
    }

    (void)(max_grc_param_cfg);
    assert(n_cmds<=max_grc_param_cfg);

    // Second backtracking pass: fill the output arrays back-to-front
    cfg = best_cfg;
    j=n_cmds-1;
    int endpos=n_inval;
    for(i=n_inval; i>=0; i--) {
        if (state[cfg][i].prev_cfg != cfg || i==0) {
            grc_param_cfg[j] = cfg;
            grc_param_pos[j] = endpos;
            j--;
            cfg = state[cfg][i].prev_cfg;
            endpos = i-1;
        }
    }
    assert(j==-1);

    for(i=0; i<n_cfg; i++) {
        free(state[i]);
    }

    *bitcnt = best_bitcnt;

    return n_cmds;
}
520
521
522/////////////////////////////// Write to bitstream
523
// Bit-level writer state over a caller-supplied byte buffer.
typedef struct bitbuf {
    uint8_t *buf;     // output byte buffer
    int buf_size;     // in bytes
    int pos;          // bit pos of next bit
    int log_symbols;  // non-zero: print each symbol as it is written
} bitbuf_t;

// Initialize the writer over 'buf'; size in byte
static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
    bb->buf = buf;
    bb->pos = 0;
    bb->buf_size = size;
    bb->log_symbols = log_symbols;
}
538
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100539static void bitbuf_putbit( bitbuf_t *bb, uint8_t bit) {
Tim Hall79d07d22020-04-27 18:20:16 +0100540 int byte_pos = bb->pos>>3;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100541 uint8_t bit_pos = bb->pos&7;
Tim Hall79d07d22020-04-27 18:20:16 +0100542 assert( byte_pos >= 0 );
543 assert( byte_pos < bb->buf_size );
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100544 bb->buf[ byte_pos ] = ((bb->buf[ byte_pos ] & ~(1U<<bit_pos)) | ((bit<<bit_pos) & 0xff)) & 0xff;
Tim Hall79d07d22020-04-27 18:20:16 +0100545 bb->pos += 1;
546}
547
548static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
549 int i;
550 if (len>0) {
551 if (bb->log_symbols)
552 printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
553 for(i=0; i<len; i++) {
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100554 bitbuf_putbit(bb, (uint8_t)((data>>i)&1));
Tim Hall79d07d22020-04-27 18:20:16 +0100555 }
556 }
557}
558
// Encode one slice of weights (and optional zero runs) into the bitstream.
// Return new bitpos. If either GRC config index is negative, nothing is
// written and the incoming bitpos is returned unchanged.
//   w_value      - weight values/indices for this slice
//   z_value      - zero run-lengths (used only when p->use_zero_runs)
//   nvalues      - number of weights in the slice (must be < 32768)
//   new_palette  - non-zero: the slice header (re)programs the palette
//   w_cfg/z_cfg  - indices into w_grc_params / z_grc_params
static int encode_slice( const int *w_value,
                         const int *z_value,
                         int nvalues,
                         palette_t *p,
                         int new_palette,
                         int uncompressed_bits,
                         int w_cfg,
                         int z_cfg,
                         uint8_t *bitbuf,
                         int bitbuf_size,
                         int bitpos,
                         int verbose )
{
    int i,j;
    bitbuf_t bitbuf_s, *bb=&bitbuf_s;
    bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
    bb->pos = bitpos;

    assert(nvalues<32768);
    if (w_cfg < 0 || z_cfg < 0)
        return bitpos;
    // GRC parameters for this slice (decode the (trunc<<4)|div packing)
    int w_grc_div = w_grc_params[w_cfg] & 15;
    int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1;
    int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2;
    int z_grc_div = z_grc_params[z_cfg] & 15;

    if (w_uncompressed) {
        w_grc_div = uncompressed_bits;
    }

    int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
    int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;

    if (verbose&1) {
        printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
               bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
    }

    // Write slice header
    bitbuf_put( bb, "ZDIV", 3, zdiv);
    bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
    bitbuf_put( bb, "WDIV", 3, wdiv);
    bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
    bitbuf_put( bb, "NEWPAL", 1, new_palette );
    if (new_palette) {
        bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
        bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
        bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
        for(i=0; i<p->palsize; i++) {
            bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
        }
    }

    // The first slice of a section carries one extra zero run (leading + trailing)
    int z_nvalues = nvalues + (new_palette?1:0);
    int w_pos=0, z_pos=0;
    int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
    int z_unary=0, z_q=-1, z_r=0;
    int w_nsymbols=0, w_remain[12]={0};
    int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
    int z_nsymbols=0, z_remain[12]={0};
    int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
    int z_unary_len = z_grc_div<3 ? 12 : 8;
    // Weight and zero-run chunks are interleaved; 'balance' keeps the two
    // streams within 8 positions of each other. Remainder bits of a chunk
    // are emitted one iteration later (the *_prev_* buffers).
    do {
        int balance = p->use_zero_runs ? w_pos - z_pos : 0;
        int w_enable = balance<8 && w_pos<nvalues;
        int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
        if (w_enable) {
            // Encode chunk (weights): split each value into unary quotient
            // bits (w_unary0/w_unary1) and a w_grc_div-bit remainder
            j=0;
            w_nsymbols=0;
            w_unary0=0;
            w_unary1=0;
            w_unary1_len=0;
            int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
            while(j<max_symbols) {
                if (w_q<0) {
                    if (w_pos<nvalues) {
                        int value = w_value[w_pos];
                        assert(value<512);
                        w_q = value>>w_grc_div;
                        w_r = value&((1<<w_grc_div)-1);
                        assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
                    } else {
                        w_q = 0;
                        w_r = -1; // don't send remainder
                    }
                }
                while( w_q>=0 && j<max_symbols) {
                    w_unary0 |= w_q>0 ? (1<<j) : 0;
                    if (w_q>0) {
                        w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
                        w_unary1_len++;
                    }
                    j++;
                    w_q-=2;
                    if (w_grc_trunc)
                        w_q--;
                }
                if (w_q<0 && w_r>=0) {
                    w_remain[w_nsymbols] = w_r;
                    w_nsymbols++;
                    w_pos++;
                }
            }
        }

        if (z_enable) {
            // Encode chunk (zrun): plain unary quotient plus remainder
            j=0;
            z_nsymbols=0;
            z_unary=0;
            while(j<z_unary_len) {
                if (z_q<0) {
                    if (z_pos<z_nvalues) {
                        int value = z_value[z_pos];
                        z_q = value>>z_grc_div;
                        z_r = value&((1<<z_grc_div)-1);
                    } else {
                        z_q = 0;
                        z_r = -1;
                    }
                }
                while( z_q>=0 && j<z_unary_len) {
                    z_unary |= z_q>0 ? (1<<j) : 0;
                    j++;
                    z_q--;
                }
                if (z_q<0 && z_r>=0) {
                    z_remain[z_nsymbols] = z_r;
                    z_nsymbols++;
                    z_pos++;
                }
            }
        }

        // Write chunk to bitstream: unary parts now, previous chunk's remainders after
        if (w_enable && !w_uncompressed) {
            bitbuf_put( bb, "WUNARY0", 12, w_unary0);
        }
        if (z_enable) {
            bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
        }
        if (w_enable && !w_uncompressed) {
            bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
        }
        if (w_prev_enable) {
            for(i=0; i<w_prev_nsymbols; i++) {
                bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
            }
        }
        if (z_prev_enable) {
            for(i=0; i<z_prev_nsymbols; i++) {
                bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
            }
        }
        w_prev_enable = w_enable;
        w_prev_nsymbols = w_nsymbols;
        memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
        z_prev_enable = z_enable;
        z_prev_nsymbols = z_nsymbols;
        memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
    } while( w_prev_enable || z_prev_enable );

    return bb->pos;
}
726
Tim Hall79d07d22020-04-27 18:20:16 +0100727// return new bitpos
728static int encode_section( const int16_t *inbuf,
729 int size,
730 palette_t *p,
731 uint8_t *bitbuf,
732 int bitbuf_size,
733 int bitpos,
734 int verbose )
735{
736 int uncompressed_bits;
737
738 // Uncompressed mode can only be used if either all weights
739 // are in the palette OR if the palette is not used.
740 if (p->only_palette) {
741 // Uncompressed bits derived from palette size
742 uncompressed_bits=0;
743 while( (1<<uncompressed_bits) < p->palsize )
744 uncompressed_bits++;
745 } else if (p->palsize==0) {
746 // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
747 uncompressed_bits = p->palbits;
748 } else {
749 // Don't use uncompressed
750 uncompressed_bits = 100;
751 }
752
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100753 uint8_t *w_slice_cfg=0;
Tim Hall79d07d22020-04-27 18:20:16 +0100754 uint8_t *z_slice_cfg=0;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100755 int *w_slice_pos=0;
Tim Hall79d07d22020-04-27 18:20:16 +0100756 int *z_slice_pos=0;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100757 int *weight_values =0;
758 int *zrun_values = 0;
759 do {
760 CHECKED_MALLOC( weight_values, size*sizeof(int) );
761 CHECKED_MALLOC( zrun_values, size*sizeof(int) );
Tim Hall79d07d22020-04-27 18:20:16 +0100762
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100763 // Get weights (or weight indicies) AND zero-runs from the input weight stream.
764 int i=0, n_weights = 0, zcnt;
765 while(1) {
766 if (p->use_zero_runs) {
767 zcnt=0;
768 // Count zero run
769 // Special case: if all weights in the section are zero, we must
770 // still ensure we have one coded weight so the the slice length
771 // doesn't become 0. Therefore we skip the first zero run and code
772 // the zero explicitly as a weight value instead
773 if (!p->only_zeros || i>0) {
774 while( i<size && inbuf[i]==0) {
775 zcnt++;
776 i++;
777 }
778 }
779 zrun_values[n_weights] = zcnt;
780 }
781 if (i==size)
782 break;
783 int value = p->inv_lut[inbuf[i]+256];
784 weight_values[n_weights] = value;
785 n_weights++;
786 i++;
Tim Hall79d07d22020-04-27 18:20:16 +0100787 }
788
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100789 // Search for good GRC parameters for the weight stream
790 int n_w_slice, w_bitcnt;
791 CHECKED_MALLOC( w_slice_cfg, size );
792 CHECKED_MALLOC( w_slice_pos, size*sizeof(int) );
793 n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100794 if ( n_w_slice < 0 ) { // Memory allocation failed
795 bitpos = -1;
796 break;
797 }
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100798 if (n_weights==0)
799 n_w_slice = 0;
800
801 // Search for good GRC parameters for the zrun stream
802 int n_z_slice=0, z_bitcnt=0;
803 if (p->use_zero_runs) {
804 CHECKED_MALLOC( z_slice_cfg, size );
805 CHECKED_MALLOC( z_slice_pos, size*sizeof(int) );
806 n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100807 if ( n_z_slice < 0 ) { // Memory allocation failed
808 bitpos = -1;
809 break;
810 }
Tim Hall79d07d22020-04-27 18:20:16 +0100811 }
812
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100813 // Encode bitstream slice
814 int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
815 while(pos<n_weights || new_palette) {
816 int endpos=pos+32767; // max slice length
Tim Hall79d07d22020-04-27 18:20:16 +0100817
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100818 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
819 endpos = w_slice_pos[i_w_slice];
820 }
Tim Hall79d07d22020-04-27 18:20:16 +0100821
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100822 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
823 endpos = z_slice_pos[i_z_slice];
824 }
825
826 if (n_weights < endpos) {
827 endpos = n_weights;
828 }
829
830 // The first slice (when new_palette is 1) encodes zero runs both at the
831 // beginning and end (i.e. number of zero runs are len+1).
832 // The following slices only encode zero runs at the end (there cannot be
833 // any zeros in the beginning since they are encoded by the previous slice)
834 int len = endpos - pos;
835 int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
836 bitpos = encode_slice( weight_values+pos, zrun_buf, len,
837 p, new_palette, uncompressed_bits,
838 w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
839 bitbuf, bitbuf_size, bitpos, verbose );
840 new_palette = 0;
841
842 if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
843 i_w_slice++;
844 }
845 if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
846 i_z_slice++;
847 }
848 pos = endpos;
Tim Hall79d07d22020-04-27 18:20:16 +0100849 }
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100850 } while(false);
Tim Hall79d07d22020-04-27 18:20:16 +0100851
852 // Free temporary buffers
853 free(w_slice_cfg);
854 free(w_slice_pos);
855 if (p->use_zero_runs) {
856 free(z_slice_cfg);
857 free(z_slice_pos);
858 }
859 free(weight_values);
860 free(zrun_values);
861
862 return bitpos;
863}
864
865// Encode the given weight stream
866// inbuf uncompressed 9bit signed weights
867// inbuf_size number of weights
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200868// outbuf compressed bitstream, buffer is malloced within this function
Tim Hall79d07d22020-04-27 18:20:16 +0100869// verbose if non-zero, printf log
870// Return value is the size in bytes of the compressed output
871// Return -1 if error
872int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
873 int i;
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200874#ifndef NDEBUG
Tim Hall79d07d22020-04-27 18:20:16 +0100875 // Range check
876 for(i=0; i<inbuf_size; i++) {
877 if (inbuf[i]<-255 || inbuf[i]>255) {
878 printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]);
879 return -1;
880 }
881 }
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200882#endif
Tim Hall79d07d22020-04-27 18:20:16 +0100883
884 int bitbuf_size = inbuf_size*2+1024;
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200885 assert(*outbuf == NULL);
Tim Hall79d07d22020-04-27 18:20:16 +0100886 *outbuf = malloc( bitbuf_size );
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100887 if (!*outbuf)
888 { // Failed to allocate buffer
889 return -1;
890 }
Tim Hall79d07d22020-04-27 18:20:16 +0100891
892 // Analyse input data to find palette re-programming points
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100893 int *palette_restart_pos = NULL;
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100894 int n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos);
Tim Hall79d07d22020-04-27 18:20:16 +0100895
896 // Compress each section (using a single palette) separately
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100897 int bitpos = 0;
898 for ( i = 0; i < n_restarts && bitpos >= 0; i++ ) {
Tim Hall79d07d22020-04-27 18:20:16 +0100899 palette_t palette;
900 int pos, size;
901 pos = palette_restart_pos[i];
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100902 size = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos;
Tim Hall79d07d22020-04-27 18:20:16 +0100903 find_palette( inbuf+pos, size, &palette);
904 create_inverse_palette( &palette);
905 bitpos = encode_section( inbuf+pos, size, &palette,
906 *outbuf, bitbuf_size, bitpos, verbose );
907 }
908
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100909 int ret = -1;
910 if ( bitpos >= 0 && n_restarts >= 0 ) { // If allocation fails bitpos or n_restarts < 0
911 // Add end of stream marker and align to 128bit
Tim Hall79d07d22020-04-27 18:20:16 +0100912 bitbuf_t bitbuf_s, *bb=&bitbuf_s;
913 bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 );
914 bb->pos = bitpos;
915 bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS);
916 bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff );
917
918 // Pad with 0xff until 64bit aligned
919 while( bb->pos & 127 ) {
920 bitbuf_put( bb, "PAD", 8, 0xff );
921 }
922 bitpos = bb->pos;
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100923
924 assert((bitpos&127)==0);
925 int outbuf_size = bitpos/8;
926 *outbuf = realloc(*outbuf, outbuf_size);
927 if ( *outbuf ) {
928 ret = outbuf_size;
929 }
Tim Hall79d07d22020-04-27 18:20:16 +0100930 }
Tim Hall79d07d22020-04-27 18:20:16 +0100931
932 free(palette_restart_pos);
933
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100934 return ret;
Tim Hall79d07d22020-04-27 18:20:16 +0100935}
936
// Release a compressed output buffer previously allocated by mlw_encode()
// or mlw_reorder_encode(). Safe to call with NULL (free(NULL) is a no-op,
// C11 7.22.3.3, so no guard is needed).
void mlw_free_outbuf( uint8_t *outbuf ) {
    free(outbuf);
}
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200941
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200942struct brick_buf_s
943{
Mauricio Briceno3e4168d2021-06-09 09:49:05 +0200944 int16_t* buf;
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200945 int* strides;
946};
947typedef struct brick_buf_s brick_buf_t;
948
949static int16_t get_brick_weight(brick_buf_t* buf, int ofm_z, int wy, int wx, int ifm_z)
950{
Mauricio Briceno3e4168d2021-06-09 09:49:05 +0200951 int16_t* p = buf->buf;
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200952
953 p += ofm_z * buf->strides[0];
954 p += wy * buf->strides[1];
955 p += wx * buf->strides[2];
956 p += ifm_z * buf->strides[3];
957
Mauricio Briceno3e4168d2021-06-09 09:49:05 +0200958 return *p;
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200959}
960
Fredrik Svedberg93d5c352021-05-11 13:51:47 +0200961static void reorder_free(int16_t* buf)
962{
963 if (buf)
964 {
965 free(buf);
966 }
967}
968
969static int16_t* reorder(
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200970 int ifm_ublock_depth,
971 int ofm_ublock_depth,
972 int ofm_depth,
973 int kernel_height,
974 int kernel_width,
975 int ifm_depth,
976 int* strides,
Mauricio Briceno3e4168d2021-06-09 09:49:05 +0200977 int16_t* inbuf,
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200978 int ofm_block_depth,
979 int is_depthwise,
980 int is_partkernel,
981 int ifm_bitdepth,
982 int decomp_h,
983 int decomp_w,
Fredrik Svedberg93d5c352021-05-11 13:51:47 +0200984 int64_t* padded_length)
Mauricio Briceno67e11f72021-05-05 12:47:28 +0200985{
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +0100986 *padded_length = -1;
Fredrik Svedberg93d5c352021-05-11 13:51:47 +0200987 /* Size unknown. Start with one page at least */
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100988 int64_t length = round_up(max(1, sizeof(int16_t)*
Fredrik Svedberg93d5c352021-05-11 13:51:47 +0200989 ofm_depth*
990 kernel_height*
991 kernel_width*
992 ifm_depth),
993 4*1024) / sizeof(int16_t);
Fredrik Svedbergdb947e42022-11-22 15:47:24 +0100994 int16_t* weights = (int16_t*)malloc(length * sizeof(int16_t));
995 if (!weights)
996 { // Alloc failed, so exit
997 return NULL;
998 }
Fredrik Svedberg93d5c352021-05-11 13:51:47 +0200999
Mauricio Briceno67e11f72021-05-05 12:47:28 +02001000 brick_buf_t brick_buf;
1001 brick_buf.buf = inbuf;
1002 brick_buf.strides = strides;
1003
1004 int ifm_block_depth = is_partkernel || ifm_bitdepth == 16 ? 16 : 32;
Fredrik Svedberg93d5c352021-05-11 13:51:47 +02001005 int64_t weight_cnt = 0;
Mauricio Briceno67e11f72021-05-05 12:47:28 +02001006 for (int ofm_block_z = 0; ofm_block_z < ofm_depth; ofm_block_z += ofm_block_depth)
1007 {
1008 int clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z);
1009 // IFM blocks required for the brick
1010 for (int ifm_block_z = 0; ifm_block_z < (is_depthwise ? 1 : ifm_depth); ifm_block_z += ifm_block_depth)
1011 {
1012 int clipped_ifm_block_depth;
1013 if (is_depthwise)
1014 {
1015 clipped_ifm_block_depth = ifm_ublock_depth;
1016 }
1017 else
1018 {
1019 clipped_ifm_block_depth = is_partkernel ?
1020 min(ifm_block_depth, ifm_depth - ifm_block_z) : ifm_block_depth;
1021 }
1022 // Weight decomposition
1023 // Subkernel Splitting (H)
1024 for (int subkernel_y = 0; subkernel_y < kernel_height; subkernel_y += decomp_h)
1025 {
1026 int sub_height = min(kernel_height - subkernel_y, decomp_h);
1027 // Subkernel splitting (W)
1028 for (int subkernel_x = 0; subkernel_x < kernel_width; subkernel_x += decomp_w)
1029 {
1030 int sub_width = min(kernel_width - subkernel_x, decomp_w);
1031 int subkernel_elements = sub_width * sub_height;
1032 // Part kernel first works across the kernel H/W and needs padding
1033 if (is_partkernel)
1034 {
1035 if (ifm_bitdepth == 16 && subkernel_elements % 2 != 0)
1036 {
1037 subkernel_elements = round_up(subkernel_elements, 2);
1038 }
1039 else if (ifm_bitdepth == 8 && subkernel_elements % 4 != 0)
1040 {
1041 subkernel_elements = round_up(subkernel_elements, 4);
1042 }
1043 }
1044 else if (is_depthwise)
1045 {
1046 subkernel_elements = round_up(subkernel_elements, 4);
1047 }
1048 int ifm_block_depth_outer = is_partkernel ? clipped_ifm_block_depth : 1;
1049 int ifm_block_depth_inner = is_partkernel ? 1 : clipped_ifm_block_depth;
1050 for (int ifm_ublk_outer = 0; ifm_ublk_outer < ifm_block_depth_outer; ifm_ublk_outer += ifm_ublock_depth)
1051 {
1052 // OFM Ublocks in OFM-block over depth
1053 for (int ofm_ublk = 0; ofm_ublk < clipped_ofm_block_depth; ofm_ublk += ofm_ublock_depth)
1054 {
1055 // HW Kernel element traversal - cannot be a H/W loop due to element
1056 // padding requirement on depthwise/part-kernel configurations
1057 for (int element = 0; element < subkernel_elements; element++)
1058 {
1059 int kx = element % sub_width;
1060 int ky = element / sub_width;
1061 // IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
1062 // In case of part-kernel-first IFM Ublock traversal have already been handled
1063 // and this loop is ignored.
1064 for (int ifm_ublk_inner = 0; ifm_ublk_inner < ifm_block_depth_inner; ifm_ublk_inner += ifm_ublock_depth)
1065 {
1066 // Feed OFM ublock elements
1067 for (int ofm_ublock_z = 0; ofm_ublock_z < ofm_ublock_depth; ofm_ublock_z++)
1068 {
1069 // Source IFM ublock elements (only 1 element deep if depthwise)
1070 for (int ifm_ublock_z = 0; ifm_ublock_z < (is_depthwise ? 1 : ifm_ublock_depth); ifm_ublock_z++)
1071 {
1072 // Source position within the current subkernel
1073 int wx = subkernel_x + kx;
1074 int wy = subkernel_y + ky;
1075 // Source IFM/OFM slices
1076 int ifm_ublk = ifm_ublk_inner + ifm_ublk_outer;
1077 int ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z;
1078 int ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z;
1079 if ((ifm_z < ifm_depth) && (ofm_z < ofm_depth) && (ky < sub_height))
1080 {
1081 weights[weight_cnt] = get_brick_weight(&brick_buf, ofm_z, wy, wx, ifm_z);
1082 }
Fredrik Svedberg93d5c352021-05-11 13:51:47 +02001083 else
1084 {
1085 weights[weight_cnt] = 0;
1086 }
Mauricio Briceno67e11f72021-05-05 12:47:28 +02001087 weight_cnt++;
Fredrik Svedbergdb947e42022-11-22 15:47:24 +01001088 if (weight_cnt == length)
Fredrik Svedberg93d5c352021-05-11 13:51:47 +02001089 {
1090 // Reallocate by doubling the buffer size as needed
Fredrik Svedbergdb947e42022-11-22 15:47:24 +01001091 length *= 2;
1092 weights = (int16_t*)realloc(weights, length * sizeof(int16_t));
1093 if (!weights)
1094 { // Realloc failed, so exit
1095 return NULL;
1096 }
Fredrik Svedberg93d5c352021-05-11 13:51:47 +02001097 }
Mauricio Briceno67e11f72021-05-05 12:47:28 +02001098 }
1099 }
1100 }
1101 }
1102 }
1103 }
1104 }
1105 }
1106 }
1107 }
1108
Fredrik Svedbergdb947e42022-11-22 15:47:24 +01001109
1110 weights = (int16_t*)realloc(weights, weight_cnt * sizeof(int16_t));
Fredrik Svedbergc222f8c2024-01-12 15:32:53 +01001111 if ( weights ) {
1112 *padded_length = weight_cnt;
1113 }
Fredrik Svedbergdb947e42022-11-22 15:47:24 +01001114
Fredrik Svedberg93d5c352021-05-11 13:51:47 +02001115 return weights;
Mauricio Briceno67e11f72021-05-05 12:47:28 +02001116}
1117
// Reorder and encode the given weight stream
// Return value is the size in bytes of the compressed output
// Return -1 if error
int mlw_reorder_encode(
    int ifm_ublock_depth,
    int ofm_ublock_depth,
    int ofm_depth,
    int kernel_height,
    int kernel_width,
    int ifm_depth,
    int* brick_strides,
    int16_t* inbuf,
    int ofm_block_depth,
    int is_depthwise,
    int is_partkernel,
    int ifm_bitdepth,
    int decomp_h,
    int decomp_w,
    uint8_t **outbuf, // *outbuf must be freed by caller
    int64_t* padded_length,
    int verbose)
{
    /* Step 1: rearrange the brick into encoder traversal order */
    int16_t* reordered = reorder(
        ifm_ublock_depth, ofm_ublock_depth,
        ofm_depth, kernel_height, kernel_width, ifm_depth,
        brick_strides, inbuf,
        ofm_block_depth, is_depthwise, is_partkernel,
        ifm_bitdepth, decomp_h, decomp_w,
        padded_length);

    /* Step 2: compress the reordered stream. Skipped when reordering
     * failed or the stream does not fit the encoder's int-sized count. */
    int bytes_written = -1;
    if (*padded_length > 0 && *padded_length <= INT32_MAX)
    {
        bytes_written = mlw_encode(reordered, (int)*padded_length, outbuf, verbose);
    }
    reorder_free(reordered);

    return bytes_written;
}