/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
        return "x86" SLJIT_CPUINFO " ABI:fastcall";
#else
        return "x86" SLJIT_CPUINFO;
#endif
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - ESP
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - RSP
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)

static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
        0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 7, 6, 3, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
        if (p >= SLJIT_R3 && p <= SLJIT_S3) { \
                if (p <= compiler->scratches) \
                        w = compiler->saveds_offset - ((p) - SLJIT_R2) * (sljit_sw)sizeof(sljit_sw); \
                else \
                        w = compiler->locals_offset + ((p) - SLJIT_S2) * (sljit_sw)sizeof(sljit_sw); \
                p = SLJIT_MEM1(SLJIT_SP); \
                do; \
        }
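/* SLJIT_R3..SLJIT_S3 have no hardware register on x86-32 (their reg_map
   entries above are 0), so CHECK_EXTRA_REGS redirects such operands to
   their slots in the stack frame through SLJIT_MEM1(SLJIT_SP), using the
   offset computed above as the new operand word. */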

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1        (SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2        (SLJIT_NUMBER_OF_REGISTERS + 3)

/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better used as a higher saved register. */
#ifndef _WIN64
/* Args: rdi(=7), rsi(=6), rdx(=2), rcx(=1), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
        0, 0, 6, 7, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
        0, 0, 6, 7, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 1
};
#else
/* Args: rcx(=1), rdx(=2), r8, r9. Scratches: rax(=0), r10, r11 */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 4] = {
        0, 0, 2, 8, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 9, 10
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 4] = {
        0, 0, 2, 0, 1,  3,  4, 5,  5,  6,  7, 7, 6, 3, 4, 1,  2
};
#endif

/* Args: xmm0-xmm3 */
static const sljit_u8 freg_map[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
        4, 0, 1, 2, 3, 5, 6
};
/* low-map. freg_map & 0x7. */
static const sljit_u8 freg_lmap[SLJIT_NUMBER_OF_FLOAT_REGISTERS + 1] = {
        4, 0, 1, 2, 3, 5, 6
};

#define REX_W           0x48
#define REX_R           0x44
#define REX_X           0x42
#define REX_B           0x41
#define REX             0x40
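/* The REX prefix has the bit layout 0100WRXB: REX_W selects a 64-bit
   operand size, while REX_R, REX_X and REX_B extend the ModRM reg, SIB
   index and ModRM rm / SIB base fields to reach r8-r15. For example,
   REX_W | REX_B (0x49) prefixes a 64-bit operation whose rm operand is
   one of r8-r15. */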

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)          ((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)         ((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define TMP_FREG        (0)

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS            0x0010
#define EX86_SHIFT_INS          0x0020
#define EX86_REX                0x0040
#define EX86_NO_REXW            0x0080
#define EX86_BYTE_ARG           0x0100
#define EX86_HALF_ARG           0x0200
#define EX86_PREF_66            0x0400
#define EX86_PREF_F2            0x0800
#define EX86_PREF_F3            0x1000
#define EX86_SSE2_OP1           0x2000
#define EX86_SSE2_OP2           0x4000
#define EX86_SSE2               (EX86_SSE2_OP1 | EX86_SSE2_OP2)
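/* These flags are or-ed into the size argument of emit_x86_instruction;
   e.g. a scalar double operation such as ADDSD (F2 0F 58 /r) is emitted
   elsewhere in this file with EX86_PREF_F2 plus an EX86_SSE2 flag. */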

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD             (/* BINARY */ 0 << 3)
#define ADD_EAX_i32     0x05
#define ADD_r_rm        0x03
#define ADD_rm_r        0x01
#define ADDSD_x_xm      0x58
#define ADC             (/* BINARY */ 2 << 3)
#define ADC_EAX_i32     0x15
#define ADC_r_rm        0x13
#define ADC_rm_r        0x11
#define AND             (/* BINARY */ 4 << 3)
#define AND_EAX_i32     0x25
#define AND_r_rm        0x23
#define AND_rm_r        0x21
#define ANDPD_x_xm      0x54
#define BSR_r_rm        (/* GROUP_0F */ 0xbd)
#define CALL_i32        0xe8
#define CALL_rm         (/* GROUP_FF */ 2 << 3)
#define CDQ             0x99
#define CMOVE_r_rm      (/* GROUP_0F */ 0x44)
#define CMP             (/* BINARY */ 7 << 3)
#define CMP_EAX_i32     0x3d
#define CMP_r_rm        0x3b
#define CMP_rm_r        0x39
#define CVTPD2PS_x_xm   0x5a
#define CVTSI2SD_x_rm   0x2a
#define CVTTSD2SI_r_xm  0x2c
#define DIV             (/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm      0x5e
#define FSTPS           0xd9
#define FSTPD           0xdd
#define INT3            0xcc
#define IDIV            (/* GROUP_F7 */ 7 << 3)
#define IMUL            (/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm       (/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8    0x6b
#define IMUL_r_rm_i32   0x69
#define JE_i8           0x74
#define JNE_i8          0x75
#define JMP_i8          0xeb
#define JMP_i32         0xe9
#define JMP_rm          (/* GROUP_FF */ 4 << 3)
#define LEA_r_m         0x8d
#define MOV_r_rm        0x8b
#define MOV_r_i32       0xb8
#define MOV_rm_r        0x89
#define MOV_rm_i32      0xc7
#define MOV_rm8_i8      0xc6
#define MOV_rm8_r8      0x88
#define MOVSD_x_xm      0x10
#define MOVSD_xm_x      0x11
#define MOVSXD_r_rm     0x63
#define MOVSX_r_rm8     (/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16    (/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8     (/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16    (/* GROUP_0F */ 0xb7)
#define MUL             (/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm      0x59
#define NEG_rm          (/* GROUP_F7 */ 3 << 3)
#define NOP             0x90
#define NOT_rm          (/* GROUP_F7 */ 2 << 3)
#define OR              (/* BINARY */ 1 << 3)
#define OR_r_rm         0x0b
#define OR_EAX_i32      0x0d
#define OR_rm_r         0x09
#define OR_rm8_r8       0x08
#define POP_r           0x58
#define POP_rm          0x8f
#define POPF            0x9d
#define PREFETCH        0x18
#define PUSH_i32        0x68
#define PUSH_r          0x50
#define PUSH_rm         (/* GROUP_FF */ 6 << 3)
#define PUSHF           0x9c
#define RET_near        0xc3
#define RET_i16         0xc2
#define SBB             (/* BINARY */ 3 << 3)
#define SBB_EAX_i32     0x1d
#define SBB_r_rm        0x1b
#define SBB_rm_r        0x19
#define SAR             (/* SHIFT */ 7 << 3)
#define SHL             (/* SHIFT */ 4 << 3)
#define SHR             (/* SHIFT */ 5 << 3)
#define SUB             (/* BINARY */ 5 << 3)
#define SUB_EAX_i32     0x2d
#define SUB_r_rm        0x2b
#define SUB_rm_r        0x29
#define SUBSD_x_xm      0x5c
#define TEST_EAX_i32    0xa9
#define TEST_rm_r       0x85
#define UCOMISD_x_xm    0x2e
#define UNPCKLPD_x_xm   0x14
#define XCHG_EAX_r      0x90
#define XCHG_r_rm       0x87
#define XOR             (/* BINARY */ 6 << 3)
#define XOR_EAX_i32     0x35
#define XOR_r_rm        0x33
#define XOR_rm_r        0x31
#define XORPD_x_xm      0x57

#define GROUP_0F        0x0f
#define GROUP_F7        0xf7
#define GROUP_FF        0xff
#define GROUP_BINARY_81 0x81
#define GROUP_BINARY_83 0x83
#define GROUP_SHIFT_1   0xd1
#define GROUP_SHIFT_N   0xc1
#define GROUP_SHIFT_CL  0xd3

#define MOD_REG         0xc0
#define MOD_DISP8       0x40

#define INC_SIZE(s)                     (*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)                     (*inst++ = (PUSH_r + (r)))
#define POP_REG(r)                      (*inst++ = (POP_r + (r)))
#define RET()                           (*inst++ = (RET_near))
#define RET_I16(n)                      (*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)            (*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
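/* Worked example: MOV_RM(0x3, 0, 1) emits 8B C1, i.e. "mov eax, ecx"
   (mod = 11 selects the register form, reg = EAX, rm = ECX). */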

/* Multithreading does not affect these static variables, since they only
   store built-in CPU features. They can safely be overwritten by different
   threads that detect the CPU features at the same time. */
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_s32 cpu_has_sse2 = -1;
#endif
static sljit_s32 cpu_has_cmov = -1;

#ifdef _WIN32_WCE
#include <cmnintrin.h>
#elif defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

/******************************************************/
/*    Unaligned-store functions                       */
/******************************************************/

static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
{
        SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
{
        SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
{
        SLJIT_MEMCPY(addr, &value, sizeof(value));
}

/******************************************************/
/*    Utility functions                               */
/******************************************************/

static void get_cpu_features(void)
{
        sljit_u32 features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

        int CPUInfo[4];
        __cpuid(CPUInfo, 1);
        features = (sljit_u32)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

        /* AT&T syntax. */
        __asm__ (
                "movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                /* On x86-32, there is no red zone, so this
                   should work (no need for a local variable). */
                "push %%ebx\n"
#endif
                "cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                "pop %%ebx\n"
#endif
                "movl %%edx, %0\n"
                : "=g" (features)
                :
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                : "%eax", "%ecx", "%edx"
#else
                : "%rax", "%rbx", "%rcx", "%rdx"
#endif
        );

#else /* _MSC_VER && _MSC_VER >= 1400 */

        /* Intel syntax. */
        __asm {
                mov eax, 1
                cpuid
                mov features, edx
        }

#endif /* _MSC_VER && _MSC_VER >= 1400 */

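        /* CPUID leaf 1 returns the feature flags in EDX: bit 15 reports
           CMOV and bit 26 reports SSE2, which is what the shifts below
           extract. */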
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
        cpu_has_sse2 = (features >> 26) & 0x1;
#endif
        cpu_has_cmov = (features >> 15) & 0x1;
}

static sljit_u8 get_jump_code(sljit_s32 type)
{
        switch (type) {
        case SLJIT_EQUAL:
        case SLJIT_EQUAL_F64:
                return 0x84 /* je */;

        case SLJIT_NOT_EQUAL:
        case SLJIT_NOT_EQUAL_F64:
                return 0x85 /* jne */;

        case SLJIT_LESS:
        case SLJIT_LESS_F64:
                return 0x82 /* jc */;

        case SLJIT_GREATER_EQUAL:
        case SLJIT_GREATER_EQUAL_F64:
                return 0x83 /* jae */;

        case SLJIT_GREATER:
        case SLJIT_GREATER_F64:
                return 0x87 /* jnbe */;

        case SLJIT_LESS_EQUAL:
        case SLJIT_LESS_EQUAL_F64:
                return 0x86 /* jbe */;

        case SLJIT_SIG_LESS:
                return 0x8c /* jl */;

        case SLJIT_SIG_GREATER_EQUAL:
                return 0x8d /* jnl */;

        case SLJIT_SIG_GREATER:
                return 0x8f /* jnle */;

        case SLJIT_SIG_LESS_EQUAL:
                return 0x8e /* jle */;

        case SLJIT_OVERFLOW:
        case SLJIT_MUL_OVERFLOW:
                return 0x80 /* jo */;

        case SLJIT_NOT_OVERFLOW:
        case SLJIT_MUL_NOT_OVERFLOW:
                return 0x81 /* jno */;

        case SLJIT_UNORDERED_F64:
                return 0x8a /* jp */;

        case SLJIT_ORDERED_F64:
                return 0x8b /* jpo */;
        }
        return 0;
}
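/* The codes above are the second byte of the two-byte near form (0F 8x
   rel32). The matching short form (7x rel8) is the same code minus 0x10,
   which generate_near_jump_code relies on below. */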

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type, sljit_sw executable_offset);
#else
static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);
#endif

static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type, sljit_sw executable_offset)
{
        sljit_s32 short_jump;
        sljit_uw label_addr;

        if (jump->flags & JUMP_LABEL)
                label_addr = (sljit_uw)(code + jump->u.label->size);
        else
                label_addr = jump->u.target - executable_offset;

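        /* Short jumps (conditional or not) are two bytes long, so the 8-bit
           displacement is measured from jump->addr + 2. */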
        short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
                return generate_far_jump_code(jump, code_ptr, type);
#endif

        if (type == SLJIT_JUMP) {
                if (short_jump)
                        *code_ptr++ = JMP_i8;
                else
                        *code_ptr++ = JMP_i32;
                jump->addr++;
        }
        else if (type >= SLJIT_FAST_CALL) {
                short_jump = 0;
                *code_ptr++ = CALL_i32;
                jump->addr++;
        }
        else if (short_jump) {
                *code_ptr++ = get_jump_code(type) - 0x10;
                jump->addr++;
        }
        else {
                *code_ptr++ = GROUP_0F;
                *code_ptr++ = get_jump_code(type);
                jump->addr += 2;
        }

        if (short_jump) {
                jump->flags |= PATCH_MB;
                code_ptr += sizeof(sljit_s8);
        } else {
                jump->flags |= PATCH_MW;
                code_ptr += sizeof(sljit_s32);
        }

        return code_ptr;
}

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
        struct sljit_memory_fragment *buf;
        sljit_u8 *code;
        sljit_u8 *code_ptr;
        sljit_u8 *buf_ptr;
        sljit_u8 *buf_end;
        sljit_u8 len;
        sljit_sw executable_offset;
        sljit_sw jump_addr;

        struct sljit_label *label;
        struct sljit_jump *jump;
        struct sljit_const *const_;

        CHECK_ERROR_PTR();
        CHECK_PTR(check_sljit_generate_code(compiler));
        reverse_buf(compiler);

        /* Second code generation pass. */
        code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
        PTR_FAIL_WITH_EXEC_IF(code);
        buf = compiler->buf;

        code_ptr = code;
        label = compiler->labels;
        jump = compiler->jumps;
        const_ = compiler->consts;
        executable_offset = SLJIT_EXEC_OFFSET(code);

        do {
                buf_ptr = buf->memory;
                buf_end = buf_ptr + buf->used_size;
                do {
                        len = *buf_ptr++;
                        if (len > 0) {
                                /* The code is already generated. */
                                SLJIT_MEMCPY(code_ptr, buf_ptr, len);
                                code_ptr += len;
                                buf_ptr += len;
                        }
                        else {
                                if (*buf_ptr >= 2) {
                                        jump->addr = (sljit_uw)code_ptr;
                                        if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
                                                code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 2, executable_offset);
                                        else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                                code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2, executable_offset);
#else
                                                code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 2);
#endif
                                        }
                                        jump = jump->next;
                                }
                                else if (*buf_ptr == 0) {
                                        label->addr = ((sljit_uw)code_ptr) + executable_offset;
                                        label->size = code_ptr - code;
                                        label = label->next;
                                }
                                else { /* *buf_ptr is 1 */
                                        const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
                                        const_ = const_->next;
                                }
                                buf_ptr++;
                        }
                } while (buf_ptr < buf_end);
                SLJIT_ASSERT(buf_ptr == buf_end);
                buf = buf->next;
        } while (buf);

        SLJIT_ASSERT(!label);
        SLJIT_ASSERT(!jump);
        SLJIT_ASSERT(!const_);

        jump = compiler->jumps;
        while (jump) {
                jump_addr = jump->addr + executable_offset;

                if (jump->flags & PATCH_MB) {
                        SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8))) <= 127);
                        *(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump_addr + sizeof(sljit_s8)));
                } else if (jump->flags & PATCH_MW) {
                        if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_sw))));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
                                sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump_addr + sizeof(sljit_s32))));
#endif
                        }
                        else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                                sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_sw))));
#else
                                SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump_addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
                                sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump_addr + sizeof(sljit_s32))));
#endif
                        }
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                else if (jump->flags & PATCH_MD)
                        sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
#endif

                jump = jump->next;
        }

        /* Some space may be wasted because of short jumps. */
        SLJIT_ASSERT(code_ptr <= code + compiler->size);
        compiler->error = SLJIT_ERR_COMPILED;
        compiler->executable_offset = executable_offset;
        compiler->executable_size = code_ptr - code;
        return (void*)(code + executable_offset);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_has_cpu_feature(sljit_s32 feature_type)
{
        switch (feature_type) {
        case SLJIT_HAS_FPU:
#ifdef SLJIT_IS_FPU_AVAILABLE
                return SLJIT_IS_FPU_AVAILABLE;
#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
                if (cpu_has_sse2 == -1)
                        get_cpu_features();
                return cpu_has_sse2;
#else /* SLJIT_DETECT_SSE2 */
                return 1;
#endif /* SLJIT_DETECT_SSE2 */

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        case SLJIT_HAS_VIRTUAL_REGISTERS:
                return 1;
#endif

        case SLJIT_HAS_CLZ:
        case SLJIT_HAS_CMOV:
                if (cpu_has_cmov == -1)
                        get_cpu_features();
                return cpu_has_cmov;

        case SLJIT_HAS_SSE2:
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
                if (cpu_has_sse2 == -1)
                        get_cpu_features();
                return cpu_has_sse2;
#else
                return 1;
#endif

        default:
                return 0;
        }
}

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

#define BINARY_OPCODE(opcode) (((opcode ## _EAX_i32) << 24) | ((opcode ## _r_rm) << 16) | ((opcode ## _rm_r) << 8) | (opcode))
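/* For example, BINARY_OPCODE(ADD) packs ADD_EAX_i32 (0x05), ADD_r_rm (0x03),
   ADD_rm_r (0x01) and the /0 group opcode of ADD into 0x05030100; the four
   bytes are unpacked again at the top of emit_cum_binary below. */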

static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
        sljit_u32 op_types,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
        sljit_u32 op_types,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw);

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
        sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src);

static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
        sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw);

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        SLJIT_ASSERT(dst != SLJIT_UNUSED);

        if (FAST_IS_REG(src)) {
                inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
                return SLJIT_SUCCESS;
        }
        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        if (!compiler->mode32) {
                                if (NOT_HALFWORD(srcw))
                                        return emit_load_imm64(compiler, dst, srcw);
                        }
                        else
                                return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
                        /* Immediate to memory move. Only SLJIT_MOV operation copies
                           an immediate directly into memory so TMP_REG1 can be used. */
                        FAIL_IF(emit_load_imm64(compiler, TMP_REG1, srcw));
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_r;
                        return SLJIT_SUCCESS;
                }
#endif
                inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }
        if (FAST_IS_REG(dst)) {
                inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
                FAIL_IF(!inst);
                *inst = MOV_r_rm;
                return SLJIT_SUCCESS;
        }

        /* Memory to memory move. Only SLJIT_MOV operation copies
           data from memory to memory so TMP_REG1 can be used. */
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
        FAIL_IF(!inst);
        *inst = MOV_r_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
        FAIL_IF(!inst);
        *inst = MOV_rm_r;
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
        sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        sljit_s32 size;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op0(compiler, op));

        switch (GET_OPCODE(op)) {
        case SLJIT_BREAKPOINT:
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = INT3;
                break;
        case SLJIT_NOP:
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                FAIL_IF(!inst);
                INC_SIZE(1);
                *inst = NOP;
                break;
        case SLJIT_LMUL_UW:
        case SLJIT_LMUL_SW:
        case SLJIT_DIVMOD_UW:
        case SLJIT_DIVMOD_SW:
        case SLJIT_DIV_UW:
        case SLJIT_DIV_SW:
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
                SLJIT_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] == 2
                        && reg_map[TMP_REG1] > 7);
#else
                SLJIT_ASSERT(
                        reg_map[SLJIT_R0] == 0
                        && reg_map[SLJIT_R1] < 7
                        && reg_map[TMP_REG1] == 2);
#endif
                compiler->mode32 = op & SLJIT_I32_OP;
#endif
                SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);

                op = GET_OPCODE(op);
                if ((op | 0x2) == SLJIT_DIV_UW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
                        inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
                        FAIL_IF(!inst);
                        *inst = XOR_r_rm;
                }

                if ((op | 0x2) == SLJIT_DIV_SW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                        FAIL_IF(!inst);
                        INC_SIZE(1);
                        *inst = CDQ;
#else
                        if (compiler->mode32) {
                                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
                                FAIL_IF(!inst);
                                INC_SIZE(1);
                                *inst = CDQ;
                        } else {
                                inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
                                FAIL_IF(!inst);
                                INC_SIZE(2);
                                *inst++ = REX_W;
                                *inst = CDQ;
                        }
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
                FAIL_IF(!inst);
                INC_SIZE(2);
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else
#ifdef _WIN64
                size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
#else
                size = (!compiler->mode32) ? 3 : 2;
#endif
                inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
                FAIL_IF(!inst);
                INC_SIZE(size);
#ifdef _WIN64
                if (!compiler->mode32)
                        *inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
                else if (op >= SLJIT_DIVMOD_UW)
                        *inst++ = REX_B;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else
                if (!compiler->mode32)
                        *inst++ = REX_W;
                *inst++ = GROUP_F7;
                *inst = MOD_REG | reg_map[SLJIT_R1];
#endif
#endif
                switch (op) {
                case SLJIT_LMUL_UW:
                        *inst |= MUL;
                        break;
                case SLJIT_LMUL_SW:
                        *inst |= IMUL;
                        break;
                case SLJIT_DIVMOD_UW:
                case SLJIT_DIV_UW:
                        *inst |= DIV;
                        break;
                case SLJIT_DIVMOD_SW:
                case SLJIT_DIV_SW:
                        *inst |= IDIV;
                        break;
                }
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
                if (op <= SLJIT_DIVMOD_SW)
                        EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#else
                if (op >= SLJIT_DIV_UW)
                        EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
                break;
        }

        return SLJIT_SUCCESS;
}

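/* Emits a single prefix/opcode byte; used below for the one-byte
   "xchg eax, reg" form (XCHG_EAX_r + reg). */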
#define ENCODE_PREFIX(prefix) \
        do { \
                inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
                FAIL_IF(!inst); \
                INC_SIZE(1); \
                *inst = (prefix); \
        } while (0)

static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_s32 work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_i8;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (reg_map[src] >= 4) {
                        SLJIT_ASSERT(dst_r == TMP_REG1);
                        EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
                } else
                        dst_r = src;
#else
                dst_r = src;
#endif
        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
                /* src, dst are registers. */
                SLJIT_ASSERT(SLOW_IS_REG(dst));
                if (reg_map[dst] < 4) {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
                        FAIL_IF(!inst);
                        *inst++ = GROUP_0F;
                        *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
                }
                else {
                        if (dst != src)
                                EMIT_MOV(compiler, dst, 0, src, 0);
                        if (sign) {
                                /* shl reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SHL;
                                /* sar reg, 24 */
                                inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
                                FAIL_IF(!inst);
                                *inst |= SAR;
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
                                FAIL_IF(!inst);
                                *(inst + 1) |= AND;
                        }
                }
                return SLJIT_SUCCESS;
        }
#endif
        else {
                /* src can be a memory address, or reg_map[src] < 4 on x86-32 architectures. */
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
        }

        if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (dst_r == TMP_REG1) {
                        /* Find an unused register whose reg_map value is less than 4. */
                        if ((dst & REG_MASK) == SLJIT_R0) {
                                if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }
                        else {
                                if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
                                        work_r = SLJIT_R0;
                                else if ((dst & REG_MASK) == SLJIT_R1)
                                        work_r = SLJIT_R2;
                                else
                                        work_r = SLJIT_R1;
                        }

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }

                        inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;

                        if (work_r == SLJIT_R0) {
                                ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
                        }
                        else {
                                inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
                                FAIL_IF(!inst);
                                *inst = XCHG_r_rm;
                        }
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = MOV_rm8_r8;
                }
#else
                inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm8_r8;
#endif
        }

        return SLJIT_SUCCESS;
}

static sljit_s32 emit_prefetch(struct sljit_compiler *compiler, sljit_s32 op,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 1;
#endif

        inst = emit_x86_instruction(compiler, 2, 0, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst++ = PREFETCH;

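        /* The ModRM reg field of the 0F 18 /r opcode selects the locality
           hint: /1 = prefetcht0, /2 = prefetcht1, /3 = prefetcht2. */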
        if (op >= SLJIT_MOV_U8 && op <= SLJIT_MOV_S8)
                *inst |= (3 << 3);
        else if (op >= SLJIT_MOV_U16 && op <= SLJIT_MOV_S16)
                *inst |= (2 << 3);
        else
                *inst |= (1 << 3);

        return SLJIT_SUCCESS;
}

static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = 0;
#endif

        if (src & SLJIT_IMM) {
                if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
                        inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
                        FAIL_IF(!inst);
                        *inst = MOV_rm_i32;
                        return SLJIT_SUCCESS;
#endif
                }
                inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_i32;
                return SLJIT_SUCCESS;
        }

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
                dst_r = src;
        else {
                inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
        }

        if (dst & SLJIT_MEM) {
                inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst = MOV_rm_r;
        }

        return SLJIT_SUCCESS;
}

static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        if (dst == src && dstw == srcw) {
                /* Same input and output */
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }

        if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED))
                dst = TMP_REG1;

        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= opcode;
                return SLJIT_SUCCESS;
        }

        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= opcode;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;

        if (dst == SLJIT_UNUSED)
                dst = TMP_REG1;

        if (FAST_IS_REG(dst)) {
                EMIT_MOV(compiler, dst, 0, src, srcw);
                inst = emit_x86_instruction(compiler, 1, 0, 0, dst, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_F7;
                *inst |= NOT_rm;
                inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
                FAIL_IF(!inst);
                *inst = OR_r_rm;
                return SLJIT_SUCCESS;
        }

        EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
        inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst++ = GROUP_F7;
        *inst |= NOT_rm;
        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
        FAIL_IF(!inst);
        *inst = OR_r_rm;
        EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
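/* CMOV cannot encode an immediate operand, so the "input was zero" value
   is kept in memory for the case below where no scratch register is free. */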
static const sljit_sw emit_clz_arg = 32 + 31;
#endif

static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_u8* inst;
        sljit_s32 dst_r;

        SLJIT_UNUSED_ARG(op_flags);

        if (cpu_has_cmov == -1)
                get_cpu_features();

        dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

        inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
        FAIL_IF(!inst);
        *inst++ = GROUP_0F;
        *inst = BSR_r_rm;

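        /* BSR returns the index of the highest set bit, but leaves the
           destination undefined (with ZF set) for a zero input. The CMOV
           below substitutes 32 + 31 (or 64 + 63) in that case, so the final
           XOR with 31 (or 63) always produces the leading-zero count. */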
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        if (cpu_has_cmov) {
                if (dst_r != TMP_REG1) {
                        EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 32 + 31);
                        inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
                }
                else
                        inst = emit_x86_instruction(compiler, 2, dst_r, 0, SLJIT_MEM0(), (sljit_sw)&emit_clz_arg);

                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = CMOVE_r_rm;
        }
        else
                FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, 32 + 31));

        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
        if (cpu_has_cmov) {
                EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31));

                inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
                FAIL_IF(!inst);
                *inst++ = GROUP_0F;
                *inst = CMOVE_r_rm;
        }
        else
                FAIL_IF(sljit_emit_cmov_generic(compiler, SLJIT_EQUAL, dst_r, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? (64 + 63) : (32 + 31)));

        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
#endif

        FAIL_IF(!inst);
        *(inst + 1) |= XOR;

        if (dst & SLJIT_MEM)
                EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
        return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src, sljit_sw srcw)
{
        sljit_s32 op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
        sljit_s32 dst_is_ereg = 0;
#endif

        CHECK_ERROR();
        CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
        ADJUST_LOCAL_OFFSET(dst, dstw);
        ADJUST_LOCAL_OFFSET(src, srcw);

        CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
        CHECK_EXTRA_REGS(src, srcw, (void)0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
        compiler->mode32 = op_flags & SLJIT_I32_OP;
#endif

        if (dst == SLJIT_UNUSED && !HAS_FLAGS(op)) {
                if (op <= SLJIT_MOV_P && (src & SLJIT_MEM))
                        return emit_prefetch(compiler, op, src, srcw);
                return SLJIT_SUCCESS;
        }

        op = GET_OPCODE(op);

        if (op >= SLJIT_MOV && op <= SLJIT_MOV_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                compiler->mode32 = 0;
#endif

                if (FAST_IS_REG(src) && src == dst) {
                        if (!TYPE_CAST_NEEDED(op))
                                return SLJIT_SUCCESS;
                }

                if (op_flags & SLJIT_I32_OP) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if (src & SLJIT_MEM) {
                                if (op == SLJIT_MOV_S32)
                                        op = SLJIT_MOV_U32;
                        }
                        else if (src & SLJIT_IMM) {
                                if (op == SLJIT_MOV_U32)
                                        op = SLJIT_MOV_S32;
                        }
#endif
                }

                if (src & SLJIT_IMM) {
                        switch (op) {
                        case SLJIT_MOV_U8:
                                srcw = (sljit_u8)srcw;
                                break;
                        case SLJIT_MOV_S8:
                                srcw = (sljit_s8)srcw;
                                break;
                        case SLJIT_MOV_U16:
                                srcw = (sljit_u16)srcw;
                                break;
                        case SLJIT_MOV_S16:
                                srcw = (sljit_s16)srcw;
                                break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        case SLJIT_MOV_U32:
                                srcw = (sljit_u32)srcw;
                                break;
                        case SLJIT_MOV_S32:
                                srcw = (sljit_s32)srcw;
                                break;
#endif
                        }
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                        if (SLJIT_UNLIKELY(dst_is_ereg))
                                return emit_mov(compiler, dst, dstw, src, srcw);
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
                        SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
                        dst = TMP_REG1;
                }
#endif

                switch (op) {
                case SLJIT_MOV:
                case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                case SLJIT_MOV_U32:
                case SLJIT_MOV_S32:
#endif
                        FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_U8:
                        FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S8:
                        FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_U16:
                        FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S16:
                        FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
                        break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                case SLJIT_MOV_U32:
                        FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
                        break;
                case SLJIT_MOV_S32:
                        FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
                        break;
#endif
                }

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
                if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
                        return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif
                return SLJIT_SUCCESS;
        }

        switch (op) {
        case SLJIT_NOT:
                if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_Z))
                        return emit_not_with_flags(compiler, dst, dstw, src, srcw);
                return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

        case SLJIT_NEG:
                return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

        case SLJIT_CLZ:
                return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
        }

        return SLJIT_SUCCESS;
}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
        if (IS_HALFWORD(immw) || compiler->mode32) { \
                inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
                FAIL_IF(!inst); \
                *(inst + 1) |= (op_imm); \
        } \
        else { \
                FAIL_IF(emit_load_imm64(compiler, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, immw)); \
                inst = emit_x86_instruction(compiler, 1, (arg == TMP_REG1) ? TMP_REG2 : TMP_REG1, 0, arg, argw); \
                FAIL_IF(!inst); \
                *inst = (op_mr); \
        }

#define BINARY_EAX_IMM(op_eax_imm, immw) \
        FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
        inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
        FAIL_IF(!inst); \
        *(inst + 1) |= (op_imm);

#define BINARY_EAX_IMM(op_eax_imm, immw) \
        FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif

static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
        sljit_u32 op_types,
        sljit_s32 dst, sljit_sw dstw,
        sljit_s32 src1, sljit_sw src1w,
        sljit_s32 src2, sljit_sw src2w)
{
        sljit_u8* inst;
        sljit_u8 op_eax_imm = (op_types >> 24);
        sljit_u8 op_rm = (op_types >> 16) & 0xff;
        sljit_u8 op_mr = (op_types >> 8) & 0xff;
        sljit_u8 op_imm = op_types & 0xff;

        if (dst == SLJIT_UNUSED) {
                EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
                if (src2 & SLJIT_IMM) {
                        BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
                }
                else {
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                return SLJIT_SUCCESS;
        }

        if (dst == src1 && dstw == src1w) {
                if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
                        if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
                                BINARY_EAX_IMM(op_eax_imm, src2w);
                        }
                        else {
                                BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
                        }
                }
                else if (FAST_IS_REG(dst)) {
                        inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                else if (FAST_IS_REG(src2)) {
                        /* Special exception for sljit_emit_op_flags. */
                        inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                else {
                        EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
                        inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
                        FAIL_IF(!inst);
                        *inst = op_mr;
                }
                return SLJIT_SUCCESS;
        }

        /* Only for cumulative operations. */
        if (dst == src2 && dstw == src2w) {
                if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
                        if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
                        if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
#endif
                                BINARY_EAX_IMM(op_eax_imm, src1w);
                        }
                        else {
                                BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
                        }
                }
                else if (FAST_IS_REG(dst)) {
                        inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
                        FAIL_IF(!inst);
                        *inst = op_rm;
                }
                else if (FAST_IS_REG(src1)) {
                        inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
1484                         FAIL_IF(!inst);
1485                         *inst = op_mr;
1486                 }
1487                 else {
1488                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1489                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1490                         FAIL_IF(!inst);
1491                         *inst = op_mr;
1492                 }
1493                 return SLJIT_SUCCESS;
1494         }
1495
1496         /* General version. */
1497         if (FAST_IS_REG(dst)) {
1498                 EMIT_MOV(compiler, dst, 0, src1, src1w);
1499                 if (src2 & SLJIT_IMM) {
1500                         BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1501                 }
1502                 else {
1503                         inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1504                         FAIL_IF(!inst);
1505                         *inst = op_rm;
1506                 }
1507         }
1508         else {
1509                 /* This version requires fewer memory writes. */
1510                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1511                 if (src2 & SLJIT_IMM) {
1512                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1513                 }
1514                 else {
1515                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1516                         FAIL_IF(!inst);
1517                         *inst = op_rm;
1518                 }
1519                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1520         }
1521
1522         return SLJIT_SUCCESS;
1523 }
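/* For cumulative (commutative) operations the operand order is free, so
   the fast paths above can operate in place whenever dst aliases either
   source. A minimal usage sketch through the public API (assuming the
   usual sljit compiler setup has run); this reduces to a single add:

     sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0,
         SLJIT_R0, 0, SLJIT_R1, 0);
*/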
1524
1525 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
1526         sljit_u32 op_types,
1527         sljit_s32 dst, sljit_sw dstw,
1528         sljit_s32 src1, sljit_sw src1w,
1529         sljit_s32 src2, sljit_sw src2w)
1530 {
1531         sljit_u8* inst;
1532         sljit_u8 op_eax_imm = (op_types >> 24);
1533         sljit_u8 op_rm = (op_types >> 16) & 0xff;
1534         sljit_u8 op_mr = (op_types >> 8) & 0xff;
1535         sljit_u8 op_imm = op_types & 0xff;
1536
1537         if (dst == SLJIT_UNUSED) {
1538                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1539                 if (src2 & SLJIT_IMM) {
1540                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1541                 }
1542                 else {
1543                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1544                         FAIL_IF(!inst);
1545                         *inst = op_rm;
1546                 }
1547                 return SLJIT_SUCCESS;
1548         }
1549
1550         if (dst == src1 && dstw == src1w) {
1551                 if (src2 & SLJIT_IMM) {
1552 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1553                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1554 #else
1555                         if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
1556 #endif
1557                                 BINARY_EAX_IMM(op_eax_imm, src2w);
1558                         }
1559                         else {
1560                                 BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
1561                         }
1562                 }
1563                 else if (FAST_IS_REG(dst)) {
1564                         inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
1565                         FAIL_IF(!inst);
1566                         *inst = op_rm;
1567                 }
1568                 else if (FAST_IS_REG(src2)) {
1569                         inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
1570                         FAIL_IF(!inst);
1571                         *inst = op_mr;
1572                 }
1573                 else {
1574                         EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
1575                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
1576                         FAIL_IF(!inst);
1577                         *inst = op_mr;
1578                 }
1579                 return SLJIT_SUCCESS;
1580         }
1581
1582         /* General version. */
1583         if (FAST_IS_REG(dst) && dst != src2) {
1584                 EMIT_MOV(compiler, dst, 0, src1, src1w);
1585                 if (src2 & SLJIT_IMM) {
1586                         BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
1587                 }
1588                 else {
1589                         inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
1590                         FAIL_IF(!inst);
1591                         *inst = op_rm;
1592                 }
1593         }
1594         else {
1595                 /* This version requires fewer memory writes. */
1596                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1597                 if (src2 & SLJIT_IMM) {
1598                         BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
1599                 }
1600                 else {
1601                         inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1602                         FAIL_IF(!inst);
1603                         *inst = op_rm;
1604                 }
1605                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1606         }
1607
1608         return SLJIT_SUCCESS;
1609 }
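/* Unlike emit_cum_binary there is no dst == src2 shortcut here: SUB and
   SBB are not commutative, so when dst aliases src2 the general path
   materializes src1 in TMP_REG1 first, roughly:

     mov tmp, src1
     sub tmp, src2      (dst aliases src2, so operate on the copy)
     mov dst, tmp
*/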
1610
1611 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
1612         sljit_s32 dst, sljit_sw dstw,
1613         sljit_s32 src1, sljit_sw src1w,
1614         sljit_s32 src2, sljit_sw src2w)
1615 {
1616         sljit_u8* inst;
1617         sljit_s32 dst_r;
1618
1619         dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
1620
1621         /* Register destination. */
1622         if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
1623                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1624                 FAIL_IF(!inst);
1625                 *inst++ = GROUP_0F;
1626                 *inst = IMUL_r_rm;
1627         }
1628         else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
1629                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
1630                 FAIL_IF(!inst);
1631                 *inst++ = GROUP_0F;
1632                 *inst = IMUL_r_rm;
1633         }
1634         else if (src1 & SLJIT_IMM) {
1635                 if (src2 & SLJIT_IMM) {
1636                         EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
1637                         src2 = dst_r;
1638                         src2w = 0;
1639                 }
1640
1641                 if (src1w <= 127 && src1w >= -128) {
1642                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1643                         FAIL_IF(!inst);
1644                         *inst = IMUL_r_rm_i8;
1645                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1646                         FAIL_IF(!inst);
1647                         INC_SIZE(1);
1648                         *inst = (sljit_s8)src1w;
1649                 }
1650 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1651                 else {
1652                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1653                         FAIL_IF(!inst);
1654                         *inst = IMUL_r_rm_i32;
1655                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1656                         FAIL_IF(!inst);
1657                         INC_SIZE(4);
1658                         sljit_unaligned_store_sw(inst, src1w);
1659                 }
1660 #else
1661                 else if (IS_HALFWORD(src1w)) {
1662                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
1663                         FAIL_IF(!inst);
1664                         *inst = IMUL_r_rm_i32;
1665                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1666                         FAIL_IF(!inst);
1667                         INC_SIZE(4);
1668                         sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
1669                 }
1670                 else {
1671                         if (dst_r != src2)
1672                                 EMIT_MOV(compiler, dst_r, 0, src2, src2w);
1673                         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1674                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1675                         FAIL_IF(!inst);
1676                         *inst++ = GROUP_0F;
1677                         *inst = IMUL_r_rm;
1678                 }
1679 #endif
1680         }
1681         else if (src2 & SLJIT_IMM) {
1682                 /* Note: src1 is NOT immediate. */
1683
1684                 if (src2w <= 127 && src2w >= -128) {
1685                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1686                         FAIL_IF(!inst);
1687                         *inst = IMUL_r_rm_i8;
1688                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
1689                         FAIL_IF(!inst);
1690                         INC_SIZE(1);
1691                         *inst = (sljit_s8)src2w;
1692                 }
1693 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
1694                 else {
1695                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1696                         FAIL_IF(!inst);
1697                         *inst = IMUL_r_rm_i32;
1698                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1699                         FAIL_IF(!inst);
1700                         INC_SIZE(4);
1701                         sljit_unaligned_store_sw(inst, src2w);
1702                 }
1703 #else
1704                 else if (IS_HALFWORD(src2w)) {
1705                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
1706                         FAIL_IF(!inst);
1707                         *inst = IMUL_r_rm_i32;
1708                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
1709                         FAIL_IF(!inst);
1710                         INC_SIZE(4);
1711                         sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
1712                 }
1713                 else {
1714                         if (dst_r != src1)
1715                                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1716                         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1717                         inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
1718                         FAIL_IF(!inst);
1719                         *inst++ = GROUP_0F;
1720                         *inst = IMUL_r_rm;
1721                 }
1722 #endif
1723         }
1724         else {
1725                 /* Neither argument is immediate. */
1726                 if (ADDRESSING_DEPENDS_ON(src2, dst_r))
1727                         dst_r = TMP_REG1;
1728                 EMIT_MOV(compiler, dst_r, 0, src1, src1w);
1729                 inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
1730                 FAIL_IF(!inst);
1731                 *inst++ = GROUP_0F;
1732                 *inst = IMUL_r_rm;
1733         }
1734
1735         if (dst & SLJIT_MEM)
1736                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
1737
1738         return SLJIT_SUCCESS;
1739 }
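/* emit_mul selects between the three x86 imul encodings:

     imul r, r/m          (0F AF)  dst is already one of the sources
     imul r, r/m, imm8    (6B)     immediate fits in a signed byte
     imul r, r/m, imm32   (69)     sign-extended 32-bit immediate

   On x86-64, immediates that do not fit in 32 bits are loaded into
   TMP_REG2 first and the register form is used. */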
1740
1741 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler,
1742         sljit_s32 dst, sljit_sw dstw,
1743         sljit_s32 src1, sljit_sw src1w,
1744         sljit_s32 src2, sljit_sw src2w)
1745 {
1746         sljit_u8* inst;
1747         sljit_s32 dst_r, done = 0;
1748
1749         /* These cases are better left to the normal code path. */
1750         if (dst == src1 && dstw == src1w)
1751                 return SLJIT_ERR_UNSUPPORTED;
1752         if (dst == src2 && dstw == src2w)
1753                 return SLJIT_ERR_UNSUPPORTED;
1754
1755         dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1756
1757         if (FAST_IS_REG(src1)) {
1758                 if (FAST_IS_REG(src2)) {
1759                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1760                         FAIL_IF(!inst);
1761                         *inst = LEA_r_m;
1762                         done = 1;
1763                 }
1764 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1765                 if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1766                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1767 #else
1768                 if (src2 & SLJIT_IMM) {
1769                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1770 #endif
1771                         FAIL_IF(!inst);
1772                         *inst = LEA_r_m;
1773                         done = 1;
1774                 }
1775         }
1776         else if (FAST_IS_REG(src2)) {
1777 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1778                 if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1779                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1780 #else
1781                 if (src1 & SLJIT_IMM) {
1782                         inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1783 #endif
1784                         FAIL_IF(!inst);
1785                         *inst = LEA_r_m;
1786                         done = 1;
1787                 }
1788         }
1789
1790         if (done) {
1791                 if (dst_r == TMP_REG1)
1792                         return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1793                 return SLJIT_SUCCESS;
1794         }
1795         return SLJIT_ERR_UNSUPPORTED;
1796 }
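/* Flag-less additions can thus become a single lea, which has a
   three-operand form and leaves EFLAGS untouched:

     lea dst, [src1 + src2]     both sources in registers
     lea dst, [src1 + imm]      register plus immediate (32-bit on x86-64)

   Everything else returns SLJIT_ERR_UNSUPPORTED and falls back to the
   ordinary add path in sljit_emit_op2. */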
1797
1798 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1799         sljit_s32 src1, sljit_sw src1w,
1800         sljit_s32 src2, sljit_sw src2w)
1801 {
1802         sljit_u8* inst;
1803
1804 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1805         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1806 #else
1807         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1808 #endif
1809                 BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1810                 return SLJIT_SUCCESS;
1811         }
1812
1813         if (FAST_IS_REG(src1)) {
1814                 if (src2 & SLJIT_IMM) {
1815                         BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1816                 }
1817                 else {
1818                         inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1819                         FAIL_IF(!inst);
1820                         *inst = CMP_r_rm;
1821                 }
1822                 return SLJIT_SUCCESS;
1823         }
1824
1825         if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1826                 inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1827                 FAIL_IF(!inst);
1828                 *inst = CMP_rm_r;
1829                 return SLJIT_SUCCESS;
1830         }
1831
1832         if (src2 & SLJIT_IMM) {
1833                 if (src1 & SLJIT_IMM) {
1834                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1835                         src1 = TMP_REG1;
1836                         src1w = 0;
1837                 }
1838                 BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1839         }
1840         else {
1841                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1842                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1843                 FAIL_IF(!inst);
1844                 *inst = CMP_r_rm;
1845         }
1846         return SLJIT_SUCCESS;
1847 }
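/* sljit_emit_op2 routes SLJIT_SUB with an unused destination here, so a
   subtract-only-for-the-flags request becomes a plain cmp. A usage
   sketch through the public API:

     sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_Z, SLJIT_UNUSED, 0,
         SLJIT_R0, 0, SLJIT_IMM, 10);
*/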
1848
1849 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
1850         sljit_s32 src1, sljit_sw src1w,
1851         sljit_s32 src2, sljit_sw src2w)
1852 {
1853         sljit_u8* inst;
1854
1855 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1856         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1857 #else
1858         if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1859 #endif
1860                 BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1861                 return SLJIT_SUCCESS;
1862         }
1863
1864 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1865         if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1866 #else
1867         if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1868 #endif
1869                 BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1870                 return SLJIT_SUCCESS;
1871         }
1872
1873         if (!(src1 & SLJIT_IMM)) {
1874                 if (src2 & SLJIT_IMM) {
1875 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1876                         if (IS_HALFWORD(src2w) || compiler->mode32) {
1877                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1878                                 FAIL_IF(!inst);
1879                                 *inst = GROUP_F7;
1880                         }
1881                         else {
1882                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src2w));
1883                                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src1, src1w);
1884                                 FAIL_IF(!inst);
1885                                 *inst = TEST_rm_r;
1886                         }
1887 #else
1888                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
1889                         FAIL_IF(!inst);
1890                         *inst = GROUP_F7;
1891 #endif
1892                         return SLJIT_SUCCESS;
1893                 }
1894                 else if (FAST_IS_REG(src1)) {
1895                         inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1896                         FAIL_IF(!inst);
1897                         *inst = TEST_rm_r;
1898                         return SLJIT_SUCCESS;
1899                 }
1900         }
1901
1902         if (!(src2 & SLJIT_IMM)) {
1903                 if (src1 & SLJIT_IMM) {
1904 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1905                         if (IS_HALFWORD(src1w) || compiler->mode32) {
1906                                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
1907                                 FAIL_IF(!inst);
1908                                 *inst = GROUP_F7;
1909                         }
1910                         else {
1911                                 FAIL_IF(emit_load_imm64(compiler, TMP_REG1, src1w));
1912                                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1913                                 FAIL_IF(!inst);
1914                                 *inst = TEST_rm_r;
1915                         }
1916 #else
1917                         inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
1918                         FAIL_IF(!inst);
1919                         *inst = GROUP_F7;
1920 #endif
1921                         return SLJIT_SUCCESS;
1922                 }
1923                 else if (FAST_IS_REG(src2)) {
1924                         inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1925                         FAIL_IF(!inst);
1926                         *inst = TEST_rm_r;
1927                         return SLJIT_SUCCESS;
1928                 }
1929         }
1930
1931         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1932         if (src2 & SLJIT_IMM) {
1933 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1934                 if (IS_HALFWORD(src2w) || compiler->mode32) {
1935                         inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1936                         FAIL_IF(!inst);
1937                         *inst = GROUP_F7;
1938                 }
1939                 else {
1940                         FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1941                         inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1942                         FAIL_IF(!inst);
1943                         *inst = TEST_rm_r;
1944                 }
1945 #else
1946                 inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1947                 FAIL_IF(!inst);
1948                 *inst = GROUP_F7;
1949 #endif
1950         }
1951         else {
1952                 inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1953                 FAIL_IF(!inst);
1954                 *inst = TEST_rm_r;
1955         }
1956         return SLJIT_SUCCESS;
1957 }
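/* Similarly, SLJIT_AND with an unused destination is emitted as test,
   which sets the flags without writing a result (sketch):

     sljit_emit_op2(compiler, SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0,
         SLJIT_R0, 0, SLJIT_IMM, 0xff);
*/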
1958
1959 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
1960         sljit_u8 mode,
1961         sljit_s32 dst, sljit_sw dstw,
1962         sljit_s32 src1, sljit_sw src1w,
1963         sljit_s32 src2, sljit_sw src2w)
1964 {
1965         sljit_u8* inst;
1966
1967         if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
1968                 if (dst == src1 && dstw == src1w) {
1969                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
1970                         FAIL_IF(!inst);
1971                         *inst |= mode;
1972                         return SLJIT_SUCCESS;
1973                 }
1974                 if (dst == SLJIT_UNUSED) {
1975                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1976                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
1977                         FAIL_IF(!inst);
1978                         *inst |= mode;
1979                         return SLJIT_SUCCESS;
1980                 }
1981                 if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
1982                         EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1983                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
1984                         FAIL_IF(!inst);
1985                         *inst |= mode;
1986                         EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
1987                         return SLJIT_SUCCESS;
1988                 }
1989                 if (FAST_IS_REG(dst)) {
1990                         EMIT_MOV(compiler, dst, 0, src1, src1w);
1991                         inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
1992                         FAIL_IF(!inst);
1993                         *inst |= mode;
1994                         return SLJIT_SUCCESS;
1995                 }
1996
1997                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1998                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
1999                 FAIL_IF(!inst);
2000                 *inst |= mode;
2001                 EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2002                 return SLJIT_SUCCESS;
2003         }
2004
2005         if (dst == SLJIT_PREF_SHIFT_REG) {
2006                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2007                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2008                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2009                 FAIL_IF(!inst);
2010                 *inst |= mode;
2011                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2012         }
2013         else if (SLOW_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2014                 if (src1 != dst)
2015                         EMIT_MOV(compiler, dst, 0, src1, src1w);
2016                 EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2017                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2018                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2019                 FAIL_IF(!inst);
2020                 *inst |= mode;
2021                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2022         }
2023         else {
2024                 /* This case is complex since ecx itself may be used for
2025                    addressing, and that case must be supported as well. */
2026                 EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2027 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2028                 EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), 0, SLJIT_PREF_SHIFT_REG, 0);
2029                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2030                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2031                 FAIL_IF(!inst);
2032                 *inst |= mode;
2033                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), 0);
2034 #else
2035                 EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2036                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2037                 inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2038                 FAIL_IF(!inst);
2039                 *inst |= mode;
2040                 EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2041 #endif
2042                 if (dst != SLJIT_UNUSED)
2043                         return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2044         }
2045
2046         return SLJIT_SUCCESS;
2047 }
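/* With a dynamic count and a destination other than ecx, the emitted
   sequence preserves the previous value of SLJIT_PREF_SHIFT_REG around
   the shift, roughly:

     mov tmp, ecx        save the shift register
     mov ecx, src2       load the count
     shl dst, cl         (or shr/sar, selected by "mode")
     mov ecx, tmp        restore
*/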
2048
2049 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
2050         sljit_u8 mode, sljit_s32 set_flags,
2051         sljit_s32 dst, sljit_sw dstw,
2052         sljit_s32 src1, sljit_sw src1w,
2053         sljit_s32 src2, sljit_sw src2w)
2054 {
2055         /* The CPU does not set flags if the shift count is 0. */
2056         if (src2 & SLJIT_IMM) {
2057 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2058                 if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2059                         return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2060 #else
2061                 if ((src2w & 0x1f) != 0)
2062                         return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2063 #endif
2064                 if (!set_flags)
2065                         return emit_mov(compiler, dst, dstw, src1, src1w);
2066                 /* OR dst, src, 0 */
2067                 return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2068                         dst, dstw, src1, src1w, SLJIT_IMM, 0);
2069         }
2070
2071         if (!set_flags)
2072                 return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2073
2074         if (!FAST_IS_REG(dst))
2075                 FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2076
2077         FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2078
2079         if (FAST_IS_REG(dst))
2080                 return emit_cmp_binary(compiler, (dst == SLJIT_UNUSED) ? TMP_REG1 : dst, dstw, SLJIT_IMM, 0);
2081         return SLJIT_SUCCESS;
2082 }
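/* x86 shifts leave the flags unchanged when the count is zero. So when
   flags were requested, a zero immediate count is rewritten as
   "or dst, 0" and a dynamic count gets an explicit cmp against zero to
   make the flags reflect the (possibly unshifted) result. */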
2083
2084 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
2085         sljit_s32 dst, sljit_sw dstw,
2086         sljit_s32 src1, sljit_sw src1w,
2087         sljit_s32 src2, sljit_sw src2w)
2088 {
2089         CHECK_ERROR();
2090         CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2091         ADJUST_LOCAL_OFFSET(dst, dstw);
2092         ADJUST_LOCAL_OFFSET(src1, src1w);
2093         ADJUST_LOCAL_OFFSET(src2, src2w);
2094
2095         CHECK_EXTRA_REGS(dst, dstw, (void)0);
2096         CHECK_EXTRA_REGS(src1, src1w, (void)0);
2097         CHECK_EXTRA_REGS(src2, src2w, (void)0);
2098 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2099         compiler->mode32 = op & SLJIT_I32_OP;
2100 #endif
2101
2102         if (dst == SLJIT_UNUSED && !HAS_FLAGS(op))
2103                 return SLJIT_SUCCESS;
2104
2105         switch (GET_OPCODE(op)) {
2106         case SLJIT_ADD:
2107                 if (!HAS_FLAGS(op)) {
2108                         if (emit_lea_binary(compiler, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2109                                 return compiler->error;
2110                 }
2111                 return emit_cum_binary(compiler, BINARY_OPCODE(ADD),
2112                         dst, dstw, src1, src1w, src2, src2w);
2113         case SLJIT_ADDC:
2114                 return emit_cum_binary(compiler, BINARY_OPCODE(ADC),
2115                         dst, dstw, src1, src1w, src2, src2w);
2116         case SLJIT_SUB:
2117                 if (!HAS_FLAGS(op)) {
2118                         if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2119                                 return compiler->error;
2120                 }
2121
2122                 if (dst == SLJIT_UNUSED)
2123                         return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2124                 return emit_non_cum_binary(compiler, BINARY_OPCODE(SUB),
2125                         dst, dstw, src1, src1w, src2, src2w);
2126         case SLJIT_SUBC:
2127                 return emit_non_cum_binary(compiler, BINARY_OPCODE(SBB),
2128                         dst, dstw, src1, src1w, src2, src2w);
2129         case SLJIT_MUL:
2130                 return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2131         case SLJIT_AND:
2132                 if (dst == SLJIT_UNUSED)
2133                         return emit_test_binary(compiler, src1, src1w, src2, src2w);
2134                 return emit_cum_binary(compiler, BINARY_OPCODE(AND),
2135                         dst, dstw, src1, src1w, src2, src2w);
2136         case SLJIT_OR:
2137                 return emit_cum_binary(compiler, BINARY_OPCODE(OR),
2138                         dst, dstw, src1, src1w, src2, src2w);
2139         case SLJIT_XOR:
2140                 return emit_cum_binary(compiler, BINARY_OPCODE(XOR),
2141                         dst, dstw, src1, src1w, src2, src2w);
2142         case SLJIT_SHL:
2143                 return emit_shift_with_flags(compiler, SHL, HAS_FLAGS(op),
2144                         dst, dstw, src1, src1w, src2, src2w);
2145         case SLJIT_LSHR:
2146                 return emit_shift_with_flags(compiler, SHR, HAS_FLAGS(op),
2147                         dst, dstw, src1, src1w, src2, src2w);
2148         case SLJIT_ASHR:
2149                 return emit_shift_with_flags(compiler, SAR, HAS_FLAGS(op),
2150                         dst, dstw, src1, src1w, src2, src2w);
2151         }
2152
2153         return SLJIT_SUCCESS;
2154 }
2155
2156 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
2157 {
2158         CHECK_REG_INDEX(check_sljit_get_register_index(reg));
2159 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2160         if (reg >= SLJIT_R3 && reg <= SLJIT_R8)
2161                 return -1;
2162 #endif
2163         return reg_map[reg];
2164 }
2165
2166 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
2167 {
2168         CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
2169 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2170         return reg;
2171 #else
2172         return freg_map[reg];
2173 #endif
2174 }
2175
2176 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
2177         void *instruction, sljit_s32 size)
2178 {
2179         sljit_u8 *inst;
2180
2181         CHECK_ERROR();
2182         CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
2183
2184         inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
2185         FAIL_IF(!inst);
2186         INC_SIZE(size);
2187         SLJIT_MEMCPY(inst, instruction, size);
2188         return SLJIT_SUCCESS;
2189 }
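/* sljit_emit_op_custom copies raw machine code into the instruction
   stream. A minimal sketch (0x90 is the x86 nop):

     sljit_u8 nop = 0x90;
     sljit_emit_op_custom(compiler, &nop, 1);
*/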
2190
2191 /* --------------------------------------------------------------------- */
2192 /*  Floating point operators                                             */
2193 /* --------------------------------------------------------------------- */
2194
2195 /* Alignment padding (3 words) + 4 constants * 16 bytes each. */
2196 static sljit_s32 sse2_data[3 + (4 * 4)];
2197 static sljit_s32 *sse2_buffer;
2198
2199 static void init_compiler(void)
2200 {
2201         /* Align to 16 bytes. */
2202         sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
2203
2204         /* Single precision constants (each constant is 16 bytes long). */
2205         sse2_buffer[0] = 0x80000000;
2206         sse2_buffer[4] = 0x7fffffff;
2207         /* Double precision constants (each constant is 16 bytes long). */
2208         sse2_buffer[8] = 0;
2209         sse2_buffer[9] = 0x80000000;
2210         sse2_buffer[12] = 0xffffffff;
2211         sse2_buffer[13] = 0x7fffffff;
2212 }
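/* The four 16-byte constants built above are SSE2 sign/abs masks; with
   the little-endian word order used here they read as:

     sse2_buffer +  0 : 0x80000000          float sign bit  (NEG_F32)
     sse2_buffer +  4 : 0x7fffffff          float abs mask  (ABS_F32)
     sse2_buffer +  8 : 0x8000000000000000  double sign bit (NEG_F64)
     sse2_buffer + 12 : 0x7fffffffffffffff  double abs mask (ABS_F64)

   sljit_emit_fop1 below implements NEG as xorpd/xorps with the sign
   mask and ABS as andpd/andps with the abs mask. */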
2213
2214 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
2215         sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2216 {
2217         sljit_u8 *inst;
2218
2219         inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2220         FAIL_IF(!inst);
2221         *inst++ = GROUP_0F;
2222         *inst = opcode;
2223         return SLJIT_SUCCESS;
2224 }
2225
2226 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
2227         sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
2228 {
2229         sljit_u8 *inst;
2230
2231         inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2232         FAIL_IF(!inst);
2233         *inst++ = GROUP_0F;
2234         *inst = opcode;
2235         return SLJIT_SUCCESS;
2236 }
2237
2238 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
2239         sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
2240 {
2241         return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2242 }
2243
2244 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
2245         sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
2246 {
2247         return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2248 }
2249
2250 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
2251         sljit_s32 dst, sljit_sw dstw,
2252         sljit_s32 src, sljit_sw srcw)
2253 {
2254         sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
2255         sljit_u8 *inst;
2256
2257 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2258         if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
2259                 compiler->mode32 = 0;
2260 #endif
2261
2262         inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2263         FAIL_IF(!inst);
2264         *inst++ = GROUP_0F;
2265         *inst = CVTTSD2SI_r_xm;
2266
2267         if (dst & SLJIT_MEM)
2268                 return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2269         return SLJIT_SUCCESS;
2270 }
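/* The float-to-integer path uses cvttsd2si / cvttss2si, i.e. truncation
   toward zero, matching C cast semantics. */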
2271
2272 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
2273         sljit_s32 dst, sljit_sw dstw,
2274         sljit_s32 src, sljit_sw srcw)
2275 {
2276         sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2277         sljit_u8 *inst;
2278
2279 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2280         if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
2281                 compiler->mode32 = 0;
2282 #endif
2283
2284         if (src & SLJIT_IMM) {
2285 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2286                 if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
2287                         srcw = (sljit_s32)srcw;
2288 #endif
2289                 EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2290                 src = TMP_REG1;
2291                 srcw = 0;
2292         }
2293
2294         inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2295         FAIL_IF(!inst);
2296         *inst++ = GROUP_0F;
2297         *inst = CVTSI2SD_x_rm;
2298
2299 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2300         compiler->mode32 = 1;
2301 #endif
2302         if (dst_r == TMP_FREG)
2303                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2304         return SLJIT_SUCCESS;
2305 }
2306
2307 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
2308         sljit_s32 src1, sljit_sw src1w,
2309         sljit_s32 src2, sljit_sw src2w)
2310 {
2311         if (!FAST_IS_REG(src1)) {
2312                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2313                 src1 = TMP_FREG;
2314         }
2315
2316         return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
2317 }
2318
2319 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
2320         sljit_s32 dst, sljit_sw dstw,
2321         sljit_s32 src, sljit_sw srcw)
2322 {
2323         sljit_s32 dst_r;
2324
2325 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2326         compiler->mode32 = 1;
2327 #endif
2328
2329         CHECK_ERROR();
2330         SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2331
2332         if (GET_OPCODE(op) == SLJIT_MOV_F64) {
2333                 if (FAST_IS_REG(dst))
2334                         return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
2335                 if (FAST_IS_REG(src))
2336                         return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
2337                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
2338                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2339         }
2340
2341         if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
2342                 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2343                 if (FAST_IS_REG(src)) {
2344                         /* We overwrite the high bits of the source. From SLJIT's point
2345                            of view, this is not an issue.
2346                            Note: in SSE3, we could also use MOVDDUP and MOVSLDUP. */
2347                         FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
2348                 }
2349                 else {
2350                         FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
2351                         src = TMP_FREG;
2352                 }
2353
2354                 FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
2355                 if (dst_r == TMP_FREG)
2356                         return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2357                 return SLJIT_SUCCESS;
2358         }
2359
2360         if (FAST_IS_REG(dst)) {
2361                 dst_r = dst;
2362                 if (dst != src)
2363                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2364         }
2365         else {
2366                 dst_r = TMP_FREG;
2367                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
2368         }
2369
2370         switch (GET_OPCODE(op)) {
2371         case SLJIT_NEG_F64:
2372                 FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
2373                 break;
2374
2375         case SLJIT_ABS_F64:
2376                 FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2377                 break;
2378         }
2379
2380         if (dst_r == TMP_FREG)
2381                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2382         return SLJIT_SUCCESS;
2383 }
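/* A usage sketch for the unary float path (public API names from
   sljit.h); this negates FR0 in place as a double via the xorpd above:

     sljit_emit_fop1(compiler, SLJIT_NEG_F64, SLJIT_FR0, 0,
         SLJIT_FR0, 0);
*/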
2384
2385 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
2386         sljit_s32 dst, sljit_sw dstw,
2387         sljit_s32 src1, sljit_sw src1w,
2388         sljit_s32 src2, sljit_sw src2w)
2389 {
2390         sljit_s32 dst_r;
2391
2392         CHECK_ERROR();
2393         CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
2394         ADJUST_LOCAL_OFFSET(dst, dstw);
2395         ADJUST_LOCAL_OFFSET(src1, src1w);
2396         ADJUST_LOCAL_OFFSET(src2, src2w);
2397
2398 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2399         compiler->mode32 = 1;
2400 #endif
2401
2402         if (FAST_IS_REG(dst)) {
2403                 dst_r = dst;
2404                 if (dst == src1)
2405                         ; /* Do nothing here. */
2406                 else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
2407                         /* Swap arguments. */
2408                         src2 = src1;
2409                         src2w = src1w;
2410                 }
2411                 else if (dst != src2)
2412                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
2413                 else {
2414                         dst_r = TMP_FREG;
2415                         FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2416                 }
2417         }
2418         else {
2419                 dst_r = TMP_FREG;
2420                 FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
2421         }
2422
2423         switch (GET_OPCODE(op)) {
2424         case SLJIT_ADD_F64:
2425                 FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2426                 break;
2427
2428         case SLJIT_SUB_F64:
2429                 FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2430                 break;
2431
2432         case SLJIT_MUL_F64:
2433                 FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2434                 break;
2435
2436         case SLJIT_DIV_F64:
2437                 FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
2438                 break;
2439         }
2440
2441         if (dst_r == TMP_FREG)
2442                 return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
2443         return SLJIT_SUCCESS;
2444 }
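/* Only SLJIT_ADD_F64 and SLJIT_MUL_F64 may swap their operands above;
   SUB and DIV are not commutative, so dst == src2 forces the result to
   be computed in TMP_FREG and stored afterwards. */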
2445
2446 /* --------------------------------------------------------------------- */
2447 /*  Conditional instructions                                             */
2448 /* --------------------------------------------------------------------- */
2449
2450 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2451 {
2452         sljit_u8 *inst;
2453         struct sljit_label *label;
2454
2455         CHECK_ERROR_PTR();
2456         CHECK_PTR(check_sljit_emit_label(compiler));
2457
2458         if (compiler->last_label && compiler->last_label->size == compiler->size)
2459                 return compiler->last_label;
2460
2461         label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2462         PTR_FAIL_IF(!label);
2463         set_label(label, compiler);
2464
2465         inst = (sljit_u8*)ensure_buf(compiler, 2);
2466         PTR_FAIL_IF(!inst);
2467
2468         *inst++ = 0;
2469         *inst++ = 0;
2470
2471         return label;
2472 }
2473
2474 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
2475 {
2476         sljit_u8 *inst;
2477         struct sljit_jump *jump;
2478
2479         CHECK_ERROR_PTR();
2480         CHECK_PTR(check_sljit_emit_jump(compiler, type));
2481
2482         jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2483         PTR_FAIL_IF_NULL(jump);
2484         set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2485         type &= 0xff;
2486
2487         /* Worst case size. */
2488 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2489         compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2490 #else
2491         compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2492 #endif
2493
2494         inst = (sljit_u8*)ensure_buf(compiler, 2);
2495         PTR_FAIL_IF_NULL(inst);
2496
2497         *inst++ = 0;
2498         *inst++ = type + 2;
2499         return jump;
2500 }
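/* The worst-case sizes reserved above: on x86-32 a jmp rel32 is 5 bytes
   and a jcc rel32 is 6; on x86-64 the rewritable form is a 10-byte
   mov reg, imm64 plus a 3-byte indirect jump, preceded for conditional
   jumps by a 2-byte short jcc of the inverted condition that skips it. */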
2501
2502 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
2503 {
2504         sljit_u8 *inst;
2505         struct sljit_jump *jump;
2506
2507         CHECK_ERROR();
2508         CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
2509         ADJUST_LOCAL_OFFSET(src, srcw);
2510
2511         CHECK_EXTRA_REGS(src, srcw, (void)0);
2512
2513         if (src == SLJIT_IMM) {
2514                 jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2515                 FAIL_IF_NULL(jump);
2516                 set_jump(jump, compiler, JUMP_ADDR);
2517                 jump->u.target = srcw;
2518
2519                 /* Worst case size. */
2520 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2521                 compiler->size += 5;
2522 #else
2523                 compiler->size += 10 + 3;
2524 #endif
2525
2526                 inst = (sljit_u8*)ensure_buf(compiler, 2);
2527                 FAIL_IF_NULL(inst);
2528
2529                 *inst++ = 0;
2530                 *inst++ = type + 2;
2531         }
2532         else {
2533 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2534                 /* REX_W is not necessary (src is not immediate). */
2535                 compiler->mode32 = 1;
2536 #endif
2537                 inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2538                 FAIL_IF(!inst);
2539                 *inst++ = GROUP_FF;
2540                 *inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2541         }
2542         return SLJIT_SUCCESS;
2543 }
2544
2545 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
2546         sljit_s32 dst, sljit_sw dstw,
2547         sljit_s32 type)
2548 {
2549         sljit_u8 *inst;
2550         sljit_u8 cond_set = 0;
2551 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2552         sljit_s32 reg;
2553 #endif
2554         /* ADJUST_LOCAL_OFFSET and CHECK_EXTRA_REGS might overwrite these values. */
2555         sljit_s32 dst_save = dst;
2556         sljit_sw dstw_save = dstw;
2557
2558         CHECK_ERROR();
2559         CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, type));
2560
2561         ADJUST_LOCAL_OFFSET(dst, dstw);
2562         CHECK_EXTRA_REGS(dst, dstw, (void)0);
2563
2564         type &= 0xff;
2565         /* setcc = jcc + 0x10. */
2566         cond_set = get_jump_code(type) + 0x10;
2567
2568 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2569         if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst)) {
2570                 inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
2571                 FAIL_IF(!inst);
2572                 INC_SIZE(4 + 3);
2573                 /* Set low register to conditional flag. */
2574                 *inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2575                 *inst++ = GROUP_0F;
2576                 *inst++ = cond_set;
2577                 *inst++ = MOD_REG | reg_lmap[TMP_REG1];
2578                 *inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2579                 *inst++ = OR_rm8_r8;
2580                 *inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2581                 return SLJIT_SUCCESS;
2582         }
2583
2584         reg = (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2585
2586         inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
2587         FAIL_IF(!inst);
2588         INC_SIZE(4 + 4);
2589         /* Set low register to conditional flag. */
2590         *inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2591         *inst++ = GROUP_0F;
2592         *inst++ = cond_set;
2593         *inst++ = MOD_REG | reg_lmap[reg];
2594         *inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2595         /* The movzx instruction does not affect flags. */
2596         *inst++ = GROUP_0F;
2597         *inst++ = MOVZX_r_rm8;
2598         *inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2599
2600         if (reg != TMP_REG1)
2601                 return SLJIT_SUCCESS;
2602
2603         if (GET_OPCODE(op) < SLJIT_ADD) {
2604                 compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2605                 return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2606         }
2607
2608 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
2609                 || (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
2610         compiler->skip_checks = 1;
2611 #endif
2612         return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2613
2614 #else
2615         /* The SLJIT_CONFIG_X86_32 code path starts here. */
2616         if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2617                 if (reg_map[dst] <= 4) {
2618                         /* Low byte is accessible. */
2619                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
2620                         FAIL_IF(!inst);
2621                         INC_SIZE(3 + 3);
2622                         /* Set low byte to conditional flag. */
2623                         *inst++ = GROUP_0F;
2624                         *inst++ = cond_set;
2625                         *inst++ = MOD_REG | reg_map[dst];
2626
2627                         *inst++ = GROUP_0F;
2628                         *inst++ = MOVZX_r_rm8;
2629                         *inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2630                         return SLJIT_SUCCESS;
2631                 }
2632
2633                 /* Low byte is not accessible. */
2634                 if (cpu_has_cmov == -1)
2635                         get_cpu_features();
2636
2637                 if (cpu_has_cmov) {
2638                         EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2639                         /* An xor reg, reg operation would overwrite the flags. */
2640                         EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2641
2642                         inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
2643                         FAIL_IF(!inst);
2644                         INC_SIZE(3);
2645
2646                         *inst++ = GROUP_0F;
2647                         /* cmovcc = setcc - 0x50. */
2648                         *inst++ = cond_set - 0x50;
2649                         *inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2650                         return SLJIT_SUCCESS;
2651                 }
2652
2653                 inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2654                 FAIL_IF(!inst);
2655                 INC_SIZE(1 + 3 + 3 + 1);
2656                 *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2657                 /* Set al to conditional flag. */
2658                 *inst++ = GROUP_0F;
2659                 *inst++ = cond_set;
2660                 *inst++ = MOD_REG | 0 /* eax */;
2661
2662                 *inst++ = GROUP_0F;
2663                 *inst++ = MOVZX_r_rm8;
2664                 *inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2665                 *inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2666                 return SLJIT_SUCCESS;
2667         }
2668
	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && reg_map[dst] <= 4) {
		SLJIT_ASSERT(reg_map[SLJIT_R0] == 0);

		if (dst != SLJIT_R0) {
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1 + 3 + 2 + 1);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 0 /* eax */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		}
		else {
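			/* dst is EAX itself, so EAX cannot serve as the scratch byte
			   register; ECX is exchanged with TMP_REG1 instead, which
			   needs the two-byte xchg r32, r/m32 form. */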
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
			FAIL_IF(!inst);
			INC_SIZE(2 + 3 + 2 + 2);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 1 /* ecx */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
		}
		return SLJIT_SUCCESS;
	}

	/* Generic path: materialize the condition bit (0 or 1) in TMP_REG1. */
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
	FAIL_IF(!inst);
	INC_SIZE(1 + 3 + 3 + 1);
	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
	/* Set al to conditional flag. */
	*inst++ = GROUP_0F;
	*inst++ = cond_set;
	*inst++ = MOD_REG | 0 /* eax */;

	*inst++ = GROUP_0F;
	*inst++ = MOVZX_r_rm8;
	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;

	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];

	if (GET_OPCODE(op) < SLJIT_ADD)
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);

#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
	compiler->skip_checks = 1;
#endif
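	/* As in the 64-bit path: fold the bit into the saved destination
	   with a normal two-operand operation. */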
	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_cmov(struct sljit_compiler *compiler, sljit_s32 type,
	sljit_s32 dst_reg,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	CHECK_ERROR();
	CHECK(check_sljit_emit_cmov(compiler, type, dst_reg, src, srcw));

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	dst_reg &= ~SLJIT_I32_OP;

	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV) || (dst_reg >= SLJIT_R3 && dst_reg <= SLJIT_S3))
		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
#else
	if (!sljit_has_cpu_feature(SLJIT_HAS_CMOV))
		return sljit_emit_cmov_generic(compiler, type, dst_reg, src, srcw);
#endif
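	/* On x86-32 the registers SLJIT_R3 .. SLJIT_S3 live in memory (see
	   CHECK_EXTRA_REGS), and cmovcc requires a register destination, so
	   those cases also fall back to the generic branch-based emulation. */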

	/* ADJUST_LOCAL_OFFSET is not needed. */
	CHECK_EXTRA_REGS(src, srcw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = dst_reg & SLJIT_I32_OP;
	dst_reg &= ~SLJIT_I32_OP;
#endif

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

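	/* Encode cmovcc dst_reg, src (0F 4x /r): get_jump_code() returns the
	   long-form jcc opcode byte (0x8x); subtracting 0x40 selects the
	   cmovcc variant with the same condition code. */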
	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = get_jump_code(type & 0xff) - 0x40;
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
	CHECK_ERROR();
	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);

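	/* On x86-64 a lea displacement must fit in a signed 32-bit immediate;
	   a larger offset is first loaded into TMP_REG1 and then combined
	   with SLJIT_SP through a register + register lea. */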
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (NOT_HALFWORD(offset)) {
		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
		SLJIT_ASSERT(emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
		return compiler->error;
#else
		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif
	}
#endif

	if (offset != 0)
		return emit_lea_binary(compiler, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
	sljit_u8 *inst;
	struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 reg;
#endif

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
	PTR_FAIL_IF(!const_);
	set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
	reg = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if (emit_load_imm64(compiler, reg, init_value))
		return NULL;
#else
	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
		return NULL;
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 2);
	PTR_FAIL_IF(!inst);

	*inst++ = 0;
	*inst++ = 1;
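	/* A record with length byte 0 marks a non-instruction entry in the
	   code buffer; the type byte 1 identifies it as a const, letting
	   generate_code record the address of the immediate just emitted. */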

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (dst & SLJIT_MEM)
		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
			return NULL;
#endif

	return const_;
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_target, sljit_sw executable_offset)
{
	SLJIT_UNUSED_ARG(executable_offset);
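	/* On x86-32 the patched field is a rel32 operand, measured from the
	   end of the 4-byte displacement, hence the (addr + 4) correction; on
	   x86-64 the target was loaded with mov r64, imm64, so the absolute
	   address is stored instead. */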
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_unaligned_store_sw((void*)addr, new_target - (addr + 4) - (sljit_uw)executable_offset);
#else
	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_target);
#endif
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant, sljit_sw executable_offset)
{
	SLJIT_UNUSED_ARG(executable_offset);
	sljit_unaligned_store_sw((void*)addr, new_constant);
}