 * Branch/Call/Jump (BCJ) filter decoders
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
#include "xz_private.h"
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
	/* Type of the BCJ filter being used */
		BCJ_X86 = 4,		/* x86 or x86-64 */
		BCJ_POWERPC = 5,	/* Big endian only */
		BCJ_IA64 = 6,		/* Big or little endian */
		BCJ_ARM = 7,		/* Little endian only */
		BCJ_ARMTHUMB = 8,	/* Little endian only */
		BCJ_SPARC = 9		/* Big or little endian */
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	/* True if we are operating in single-call mode. */
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	/* x86 filter state */
	uint32_t x86_prev_mask;
	/* Temporary space to hold the variables from struct xz_buf */
		/* Amount of already filtered data in the beginning of buf */
		/* Total amount of data currently stored in buf */
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 * Type         Alignment   Look-ahead
 * This macro is used to test the most significant byte of a memory address
 * in an x86 instruction.
#define bcj_x86_test_msbyte(b) ((b) == 0x00 || (b) == 0xFF)
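/*
 * A call/jump target is converted only when its most significant byte is
 * 0x00 or 0xFF, i.e. when the 32-bit operand is a sign extension of its low
 * 25 bits. The encoder applied the same test, so other operands are left
 * untouched here.
 */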
static noinline_for_stack size_t XZ_FUNC bcj_x86(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };
	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	for (i = 0; i < size; ++i) {
		if ((buf[i] & 0xFE) != 0xE8)
			continue;
		prev_pos = i - prev_pos;
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_mask = (prev_mask << 1) | 1;
		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
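			/*
			 * The encoder stored an absolute target address;
			 * convert it back to a displacement relative to the
			 * end of this five-byte call/jump instruction.
			 */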
				dest = src - (s->pos + (uint32_t)i + 5);
				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;
				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
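			/*
			 * Sign-extend bit 24 of the displacement so that its
			 * most significant byte becomes 0x00 or 0xFF again.
			 */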
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			prev_mask = (prev_mask << 1) | 1;
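	/*
	 * Save the filter state so that filtering can resume seamlessly
	 * when the next chunk of data is processed.
	 */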
	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
#ifdef XZ_DEC_POWERPC
static noinline_for_stack size_t XZ_FUNC bcj_powerpc(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
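		/*
		 * Convert only relative branch-and-link instructions:
		 * primary opcode 18 with AA = 0 and LK = 1.
		 */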
		if ((instr & 0xFC000003) == 0x48000001) {
			instr -= s->pos + (uint32_t)i;
			put_unaligned_be32(instr, buf + i);
static noinline_for_stack size_t XZ_FUNC bcj_ia64(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	 * The local variables take a little bit of stack space, but it's
	 * less than what the LZMA2 decoder takes, so it doesn't make sense
	 * to reduce stack usage here without doing that for the LZMA2
	 * decoder too.
	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	/* Bitwise offset of the instruction indicated by slot */
	/* bit_pos split into byte and bit parts */
	/* Address part of an instruction */
	/* Mask used to detect which instructions to convert */
	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	/* Instruction normalized with bit_res for easier manipulation */
	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
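		/*
		 * Bits 0-2 of mask tell which of the three 41-bit
		 * instruction slots in this bundle may hold a branch that
		 * needs conversion; the table is indexed by the bundle's
		 * 5-bit template field.
		 */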
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;
			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);
			norm = instr >> bit_res;
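			/*
			 * Convert only branch instructions that use a 21-bit
			 * IP-relative displacement: major opcode 5 with bits
			 * 9-11 cleared.
			 */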
			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr -= s->pos + (uint32_t)i;
				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);
				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;
				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
static noinline_for_stack size_t XZ_FUNC bcj_arm(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	for (i = 0; i + 4 <= size; i += 4) {
		if (buf[i + 3] == 0xEB) {
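			/*
			 * 0xEB in the most significant byte marks an
			 * unconditional BL instruction; the low three bytes
			 * hold its 24-bit branch offset in words.
			 */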
			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
					| ((uint32_t)buf[i + 2] << 16);
			addr -= s->pos + (uint32_t)i + 8;
			buf[i] = (uint8_t)addr;
			buf[i + 1] = (uint8_t)(addr >> 8);
			buf[i + 2] = (uint8_t)(addr >> 16);
#ifdef XZ_DEC_ARMTHUMB
static noinline_for_stack size_t XZ_FUNC bcj_armthumb(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	for (i = 0; i + 4 <= size; i += 2) {
		if ((buf[i + 1] & 0xF8) == 0xF0
				&& (buf[i + 3] & 0xF8) == 0xF8) {
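			/*
			 * A Thumb BL is a pair of 16-bit instructions: the
			 * first half (0b11110xxx in its high byte) carries
			 * the high bits of the offset and the second half
			 * (0b11111xxx) the low bits. Reassemble the offset
			 * from the two halves.
			 */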
			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
					| ((uint32_t)buf[i] << 11)
					| (((uint32_t)buf[i + 3] & 0x07) << 8)
					| (uint32_t)buf[i + 2];
			addr -= s->pos + (uint32_t)i + 4;
			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
			buf[i] = (uint8_t)(addr >> 11);
			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
			buf[i + 2] = (uint8_t)addr;
static noinline_for_stack size_t XZ_FUNC bcj_sparc(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
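		/*
		 * A SPARC call instruction is "01" followed by a 30-bit
		 * word displacement; convert only calls whose displacement
		 * has its top eight bits either all set or all cleared.
		 */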
		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
			instr -= s->pos + (uint32_t)i;
			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
					| 0x40000000 | (instr & 0x3FFFFF);
			put_unaligned_be32(instr, buf + i);
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, where
 * pointers to static data must be avoided (at least on x86).
static void XZ_FUNC bcj_apply(struct xz_dec_bcj *s,
		uint8_t *buf, size_t *pos, size_t size)
		filtered = bcj_x86(s, buf, size);
#ifdef XZ_DEC_POWERPC
		filtered = bcj_powerpc(s, buf, size);
		filtered = bcj_ia64(s, buf, size);
		filtered = bcj_arm(s, buf, size);
#ifdef XZ_DEC_ARMTHUMB
		filtered = bcj_armthumb(s, buf, size);
		filtered = bcj_sparc(s, buf, size);
		/* Never reached but silence compiler warnings. */
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
static void XZ_FUNC bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
	b->out_pos += copy_size;
	s->temp.filtered -= copy_size;
	s->temp.size -= copy_size;
	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
 * The BCJ filter functions are primitive in the sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does the
 * filtering in an intermediate buffer when needed.
XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_run(struct xz_dec_bcj *s,
		struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If not everything could be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	if (s->temp.size < b->out_size - b->out_pos) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;
		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;
		bcj_apply(s, b->out, &out_start, b->out_pos);
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can return XZ_STREAM_END here as well, since the last
		 * few bytes that remain unfiltered are meant to remain
		 * unfiltered.
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);
	 * If we have unfiltered data in temp, try to fill by decoding more
	 * data from the next filter. Apply the BCJ filter on temp. Then we
	 * can hopefully fill the actual output buffer by copying filtered
	 * data from temp. A mix of filtered and unfiltered data may be left
	 * in temp; it will be taken care of on the next call to this
	 * function.
	if (s->temp.size > 0) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);
		s->ret = xz_dec_lzma2_run(lzma2, b);
		s->temp.size = b->out_pos;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;
		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;
		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;
	if (s->temp.filtered > 0)
XZ_EXTERN struct xz_dec_bcj * XZ_FUNC xz_dec_bcj_create(bool single_call)
	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s != NULL)
		s->single_call = single_call;
XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_reset(
		struct xz_dec_bcj *s, uint8_t id)
#ifdef XZ_DEC_POWERPC
#ifdef XZ_DEC_ARMTHUMB
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;