;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION_RODATA 32
low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
lea r14, [r1 - 2] ; tmpreg = (w-2);
and r14, -8 ; tmpreg &= (~7);
-.first_loop
+.first_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
movzx r8, byte [r7 + 1] ; int old_pix = src[1];
movzx r9, byte [r7] ; int old_sum = src[0];
add r9, r8 ; old_sum += old_pix
-.second_loop
+.second_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
cmp r6, r1 ; x < w
jl .second_loop
mov r5, 2 ; int y = 2;
-.height_loop
+.height_loop:
mov r10, r5; int tmpreg = y;
imul r10, r3; tmpreg *= stride;
lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
movzx r10, byte [r7] ; temp1 = src[0];
movzx r11, byte [r7 + 1] ; temp2 = src[1];
add r10, r11; temp1 += temp2
- movd xmm0, r10; __m128i old_pix_128 = temp2;
- movd xmm1, r11; __m128i old_sum_128 = temp1;
-.width_loop
+ movd xm0, r10d; __m128i old_pix_128 = temp2;
+ movd xm1, r11d; __m128i old_sum_128 = temp1;
+.width_loop:
movq xmm2, [r7 + r6]; __m128i new_pix = (src+x);
punpcklbw xmm2, xmm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
movdqa xmm3, xmm2 ; __m128i temp = new_pix;
movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
add r9, r8
jmp .final_width_check
-.final_width_loop
+.final_width_loop:
movzx r10, byte [r7 + r6] ; temp1 = src[x];
lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
inc r6 ; x++
-.final_width_check
+.final_width_check:
cmp r6, r1 ; x < w
jl .final_width_loop
inc r5 ; y++;
lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
lea r14, [r1 - 2] ; tmpreg = (w-2);
and r14, -16 ; tmpreg &= (~15);
- vmovdqa ymm7, [low_word_zero wrt rip]
-.first_loop
+ vmovdqa ymm7, [low_word_zero]
+.first_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
movzx r8, byte [r7 + 1] ; int old_pix = src[1];
movzx r9, byte [r7] ; int old_sum = src[0];
add r9, r8 ; old_sum += old_pix
-.second_loop
+.second_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
cmp r6, r1 ; x < w
jl .second_loop
mov r5, 2 ; int y = 2;
-.height_loop
+.height_loop:
mov r10, r5; int tmpreg = y;
imul r10, r3; tmpreg *= stride;
lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
add r10, r11; temp1 += temp2
vmovd xmm0, r10d; __m128i old_pix_128 = temp2;
vmovd xmm1, r11d; __m128i old_sum_128 = temp1;
-.width_loop
+.width_loop:
vpermq ymm2, [r7 + r6], 0x10
vpunpcklbw ymm2, ymm2, ymm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
vpermq ymm8, ymm2, 0x4e
movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
add r9, r8
jmp .final_width_check
-.final_width_loop
+.final_width_loop:
movzx r10, byte [r7 + r6] ; temp1 = src[x];
lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
inc r6 ; x++
-.final_width_check
+.final_width_check:
cmp r6, r1 ; x < w
jl .final_width_loop
inc r5 ; y++;
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION_RODATA 32
add r6, mmsize
cmp r6, r7
jl .stride_loop ; still in scan line
- .stride_loop2
+ .stride_loop2:
cmp r6, r5
jge .finish
movzx r8, byte [r0 + r6]
mov byte [r0 + r6], r8b
inc r6
jmp .stride_loop2
- .finish
+ .finish:
add r0, r1
add r2, r3
cmp r2, r4
imul r7, r3
add r7, r2 ; last address
pxor xmm2, xmm2
- movdqa xmm3, [words_255 wrt rip]
+ movdqa xmm3, [words_255]
mov r9, r6
and r9, -8 ; &= (~7);
.height_loop:
add r8, 8
cmp r8, r9
jl .stride_loop ; still in scan line
-.stride_loop2
+.stride_loop2:
cmp r8, r6
jge .finish
movzx r10, byte [r2 + r8]
imul r7, r3
add r7, r2 ; last address
vpxor ymm2, ymm2
- vmovdqa ymm3, [words_255 wrt rip]
+ vmovdqa ymm3, [words_255]
mov r9, r6
and r9, -16 ; &= (~15);
.height_loop:
add r8, 16
cmp r8, r9
jl .stride_loop ; still in scan line
-.stride_loop2
+.stride_loop2:
cmp r8, r6
jge .finish
movzx r10, byte [r2 + r8]
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "utils.asm"
+%include "x86/utils.asm"
SECTION_RODATA 32
mova m2, [words_one]
jmp .row_loop
-.col_loop
+.col_loop:
mova m1, [r1]
%if mmsize == 32
vpermq m1, m1, q3120
mova [r0 + r5], m1
add r5, r4
add r1, mmsize
-.row_loop
+.row_loop:
cmp r5, r3
jl .col_loop
sub r5, r4
psrlw m0, 1
mova [r0 + r5], m0
-.skip_odd
+.skip_odd:
add r5, mmsize
sub r5, r3
add r1, r2
sub r5, r6
jmp .row_loop
-.col_loop
+.col_loop:
mova m0, [r2]
mova m2, m0
psrlw m2, 8
jb .col_loop
add r0, r5
add r2, r4
-.row_loop
+.row_loop:
mova m3, [words_dither0]
mova m4, [words_dither1]
lea r6, [r2 + r4]
jb .odd_stripe
RET
-.odd_stripe
+.odd_stripe:
mova m0, [r2]
mova m2, m0
psrlw m2, 8
%endif
lea r5, [r0 + r3]
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -4 * mmsize
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
-.row_loop
+.row_loop:
LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5
%if ARCH_X86_64 == 0
PUSH t0
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
jb .odd_stripe
RET
-.odd_stripe
+.odd_stripe:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * mmsize
pxor m0, m0
pxor m1, m1
-.row_loop
+.row_loop:
LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
paddw m3, m0, m2
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * mmsize
pxor m0, m0
pxor m1, m1
-.row_loop
+.row_loop:
LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
paddw m0, m2
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -4 * mmsize
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
-.row_loop
+.row_loop:
LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
%if ARCH_X86_64
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -6 * mmsize
-.row_loop
+.row_loop:
mova m6, m4
mova m7, m4
LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
%if %%i4 > 4
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * %%i4 * mmsize
-.row_loop
+.row_loop:
mova m6, m8
mova m7, m8
LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION .text
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "utils.asm"
+%include "x86/utils.asm"
SECTION_RODATA 32
mov r2d, (1 << %1)
jmp .loop_entry
-.loop_start
+.loop_start:
add r0, r1
%if ARCH_X86_64 || a_shift == 0
psubw m1, m8
BCASTW 7, r3d
psubw m1, m7
%endif
-.loop_entry
+.loop_entry:
%assign i 0
%rep (1 << %1) / mmsize
%if i
%define dn_pos [rstk + delta_offs + 2 * tile_size + 8]
%endif
-.line_loop
+.line_loop:
%if ARCH_X86_64 == 0
mov t3, r2m
lea t0, [t3 + line_size]
jmp .bulk_fill
%endif
-.generic_fist
+.generic_fist:
%if ARCH_X86_64 == 0
mov t5, dn_addr
%if a_shift
%endif
%endif
-.bulk_fill
+.bulk_fill:
mov t2d, 1 << (13 - %1)
mov t0d, t9d ; b
sar t0d, 1
mova mm_full, [words_tile%2]
%endif
-.internal_loop
+.internal_loop:
%assign i 0
%rep (2 << %1) / mmsize
%if i
psubw mm_c, m0
%endif
-.end_loop
+.end_loop:
%if ARCH_X86_64
test t7d, t7d
jz .end_line_loop
jmp .last_line
%endif
-.single_line
+.single_line:
%if ARCH_X86_64 == 0
mov t7d, dn_pos
%endif
mov t2d, t7d
sub t2d, t6d ; dn_pos - up_pos
add t6d, t7d ; dn_pos + up_pos
-.last_line
+.last_line:
FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
-.end_line_loop
+.end_line_loop:
%if ARCH_X86_64
add r2, line_size
sub r3, 1
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%define PIC
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
;------------------------------------------------------------------------------
; MUL 1:reg, 2:num