;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Rounding term for the bilinear filters: the code adds it before "psrlw 4",
; i.e. it is half of the filter weight sum (16).
pw_8: times 8 dw 8

; Bilinear filter taps, one entry per sub-pixel offset 0..7.
; Each entry is 32 bytes: eight words of tap A followed by eight words of
; tap B, with A + B == 16.  Entries are addressed as
; bilin_filter_m_sse2 + (offset << 5); entry 4 is the half-pel filter (8, 8).
bilin_filter_m_sse2: times 8 dw 16
                     times 8 dw 0
                     times 8 dw 14
                     times 8 dw 2
                     times 8 dw 12
                     times 8 dw 4
                     times 8 dw 10
                     times 8 dw 6
                     times 16 dw 8
                     times 8 dw 6
                     times 8 dw 10
                     times 8 dw 4
                     times 8 dw 12
                     times 8 dw 2
                     times 8 dw 14

SECTION .text

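; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
;                               int x_offset, int y_offset,
;                               const uint8_t *ref, ptrdiff_t ref_stride,
;                               int height, unsigned int *sse);
;
; This function returns the SE and stores SSE in the given pointer.
;
; Note: the symbols actually emitted below are named
; highbd_sub_pixel_variance%1xh and highbd_sub_pixel_avg_variance%1xh, and
; they process the samples as 16-bit words. The avg variants take two extra
; arguments, second_pred and second_stride, between ref_stride and height.
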
; SUM_SSE src1, ref1, src2, ref2, sum, sse
;
; Accumulates the differences (src1 - ref1) and (src2 - ref2), each a vector
; of eight words, into two dword accumulators:
;   %5 (sum) += the eight word-wise sums diff1 + diff2, folded to four words
;               (lane i + lane i+4) and sign-extended to dwords
;   %6 (sse) += the squared differences, pairwise-added to dwords by pmaddwd
;
; Clobbers %1-%4 (all four inputs are overwritten).
%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
  psubw %3, %4 ; %3 = src2 - ref2
  psubw %1, %2 ; %1 = src1 - ref1
  mova %4, %3 ; make copies to manipulate to calc sum
  mova %2, %1 ; use originals for calc sse
  pmaddwd %3, %3 ; squared diffs of row 2, as four dwords
  paddw %4, %2 ; word-wise diff1 + diff2
  pmaddwd %1, %1 ; squared diffs of row 1, as four dwords
  movhlps %2, %4 ; upper four words of the sum -> low half of %2
  paddd %6, %3
  paddw %4, %2 ; low four words = lane i + lane i+4
  pxor %2, %2
  pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
  punpcklwd %4, %2 ; sign-extend word to dword
  paddd %6, %1
  paddd %5, %4

%endmacro

; STORE_AND_RET
;
; Common epilogue: horizontally reduces the two dword accumulators and returns.
;   m7 (sse) -> its four dwords are added and stored through the sse pointer
;   m6 (sum) -> its four dwords are added and returned in eax
; SUM_SSE has already sign-extended the sum lanes to dwords, so only dword
; additions are needed here.
; Clobbers m3, m4 and r1 (reloaded with the sse pointer argument).
%macro STORE_AND_RET 0
%if mmsize == 16
  movhlps m3, m7 ; upper two dwords of sse
  movhlps m4, m6 ; upper two dwords of sum
  paddd m7, m3
  paddd m6, m4
  pshufd m3, m7, 0x1 ; bring dword 1 down to lane 0
  pshufd m4, m6, 0x1
  paddd m7, m3 ; lane 0 = total sse
  paddd m6, m4 ; lane 0 = total sum
  mov r1, ssem ; r1 = unsigned int *sse
  movd [r1], m7 ; store sse
  movd eax, m6 ; store sum as return value
%endif
  RET
%endmacro

; INC_SRC_BY_SRC_STRIDE
;
; Advances srcq by 2 * src_stride bytes.
; On x86-32 PIC builds the src_stride register is reused as a temporary by the
; bilinear x/y path of SUBPEL_VARIANCE, so the stride is added twice from its
; memory operand instead of being scaled in an lea.
%macro INC_SRC_BY_SRC_STRIDE 0
%if ARCH_X86=1 && CONFIG_PIC=1
  add srcq, src_stridemp
  add srcq, src_stridemp
%else
  lea srcq, [srcq + src_strideq*2]
%endif
%endmacro

; SUBPEL_VARIANCE W [, avg]
;
; Emits one function, highbd_sub_pixel_variance%1xh, or, when the second
; argument is 1, highbd_sub_pixel_avg_variance%1xh.
;   %1  block width in samples (ASSERTed to be <= 16)
;   %2  1 = also pavgw the filtered source with second_pred (default 0)
;
; Register roles inside the body:
;   m6 = running sum of differences (dwords), m7 = running SSE (dwords);
;   both are reduced and returned by STORE_AND_RET.
;   m0-m5 are scratch; on x86-64 m8-m12 hold the filter taps and rounding.
;
; Structure: the code dispatches on x_offset and y_offset, each being
;   0               -> no filtering in that direction
;   half_pel_offset -> plain pavgw of the two neighbours
;   anything else   -> 2-tap bilinear filter from bilin_filter_m
; which gives nine loops.  For %1 < 16 every loop iteration handles two rows,
; so block_height is halved up front.
;
; Samples are handled as 16-bit words: the horizontal neighbour is at +2
; bytes and strides are scaled by 2 when used as byte offsets.
;
; half_pel_offset is 4 because the filter table holds 8 entries of 32 bytes
; (filter_idx_shift == 5): valid offsets are 0..7 and entry 4 is the (8, 8)
; filter.  Comparing against 8 left the pavgw fast paths unreachable; the
; results are the same either way since (8*a + 8*b + 8) >> 4 == pavgw(a, b).
%macro SUBPEL_VARIANCE 1-2 0 ; W
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%define half_pel_offset 4


%if ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store bilin_filter and pw_8 location in stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4 ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1 ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT %1 <= 16 ; m6 overflows if w > 16
  pxor m6, m6 ; sum
  pxor m7, m7 ; sse

%if %1 < 16
  sar block_height, 1 ; two rows are processed per loop iteration
%endif
%if %2 == 1 ; avg
  shl second_str, 1 ; second_pred stride as a byte offset
%endif

  ; FIXME(rbultje) replace by jumptable?
  test x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq + 16]
  mova m1, [refq]
  mova m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m2, [second_predq+16]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq + src_strideq*2]
  mova m1, [refq]
  mova m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m1, m2, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp y_offsetd, half_pel_offset
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pavgw m0, m1
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*2+16]
  mova m2, [refq]
  mova m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m5, [srcq+src_strideq*4]
  mova m4, m1
  mova m2, [refq]
  mova m3, [refq+ref_strideq*2]
  pmullw m1, filter_y_a
  pmullw m5, filter_y_b
  paddw m1, filter_rnd
  pmullw m0, filter_y_a
  pmullw m4, filter_y_b
  paddw m0, filter_rnd
  paddw m1, m5
  paddw m0, m4
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp x_offsetd, half_pel_offset
  jne .x_nonhalf
  ; x_offset == 0.5
  test y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  mova m2, [refq]
  mova m3, [refq + 16]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  mova m2, [refq]
  mova m3, [refq + ref_strideq*2]
  pavgw m0, m4
  pavgw m1, m5
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp y_offsetd, half_pel_offset
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  ; prime m0/m1 with the horizontally averaged first row
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + 16]
  movu m4, [srcq + 2]
  movu m5, [srcq + 18]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m1, m3
  mova m4, [refq]
  mova m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2 ; current row becomes the previous row
  mova m1, m3

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  ; prime m0 with the horizontally averaged first row
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq + src_strideq*2]
  movu m4, [srcq + 2]
  movu m5, [srcq + src_strideq*2 + 2]
  pavgw m2, m4
  pavgw m3, m5
  pavgw m0, m2
  pavgw m2, m3
  mova m4, [refq]
  mova m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3 ; last filtered row is reused by the next iteration

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+y_offsetq]
  mova m9, [bilin_filter+y_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
  pavgw m1, m3
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m1, filter_rnd
  paddw m1, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m1, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  lea srcq, [srcq + src_strideq*2]
  lea refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  lea srcq, [srcq + src_strideq*2]
  pavgw m0, m2
.x_half_y_other_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pavgw m2, m4
  pavgw m3, m5
  mova m4, m2
  mova m5, m3
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m4, filter_rnd
  paddw m4, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  psrlw m4, 4
  paddw m0, m2
  mova m2, [refq]
  psrlw m0, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  lea srcq, [srcq + src_strideq*4]
  lea refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  mova m4, [refq]
  mova m5, [refq+16]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m1, [srcq+src_strideq*2]
  movu m2, [srcq+2]
  movu m3, [srcq+src_strideq*2+2]
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m1, m3
  paddw m0, m2
  psrlw m1, 4
  psrlw m0, 4
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m1, [second_predq]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7

  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp y_offsetd, half_pel_offset
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+2]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+16]
  movu m4, [srcq+2]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+16]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m1, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m4, m1, m5, m6, m7
  mova m0, m2
  mova m1, m3

  lea srcq, [srcq+src_strideq*2]
  lea refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4
  lea srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu m2, [srcq]
  movu m3, [srcq+src_strideq*2]
  movu m4, [srcq+2]
  movu m5, [srcq+src_strideq*2+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  mova m4, [refq]
  mova m5, [refq+ref_strideq*2]
  psrlw m2, 4
  psrlw m3, 4
  pavgw m0, m2
  pavgw m2, m3
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m2, [second_predq]
%endif
  SUM_SSE m0, m4, m2, m5, m6, m7
  mova m0, m3

  lea srcq, [srcq+src_strideq*4]
  lea refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
  ; loading filter - this is same as in 8-bit depth
%if ARCH_X86_64
  lea bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl y_offsetd, filter_idx_shift
%if ARCH_X86_64 && mmsize == 16
  mova m8, [bilin_filter+x_offsetq]
  mova m9, [bilin_filter+x_offsetq+16]
  mova m10, [bilin_filter+y_offsetq]
  mova m11, [bilin_filter+y_offsetq+16]
  mova m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case, there is NO unused register. Used src_stride register. Later,
  ; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add x_offsetq, tempq
  add y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add x_offsetq, bilin_filter
  add y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
  ; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  movu m1, [srcq+16]
  movu m3, [srcq+18]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  pmullw m1, filter_x_a
  pmullw m3, filter_x_b
  paddw m1, filter_rnd
  paddw m0, m2
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  movu m3, [srcq+16]
  movu m5, [srcq+18]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m1, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m1, filter_rnd
  mova m2, [refq]
  paddw m1, m3
  psrlw m0, 4
  psrlw m1, 4
  mova m3, [refq+16]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  pavgw m1, [second_predq+16]
%endif
  SUM_SSE m0, m2, m1, m3, m6, m7
  mova m0, m4
  mova m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%else ; %1 < 16
  movu m0, [srcq]
  movu m2, [srcq+2]
  pmullw m0, filter_x_a
  pmullw m2, filter_x_b
  paddw m0, filter_rnd
  paddw m0, m2
  psrlw m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu m2, [srcq]
  movu m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu m3, [srcq]
  movu m5, [srcq+2]
  pmullw m2, filter_x_a
  pmullw m4, filter_x_b
  paddw m2, filter_rnd
  pmullw m3, filter_x_a
  pmullw m5, filter_x_b
  paddw m3, filter_rnd
  paddw m2, m4
  paddw m3, m5
  psrlw m2, 4
  psrlw m3, 4
  mova m4, m2
  mova m5, m3
  pmullw m0, filter_y_a
  pmullw m2, filter_y_b
  paddw m0, filter_rnd
  pmullw m4, filter_y_a
  pmullw m3, filter_y_b
  paddw m0, m2
  paddw m4, filter_rnd
  mova m2, [refq]
  paddw m4, m3
  psrlw m0, 4
  psrlw m4, 4
  mova m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw m0, [second_predq]
  add second_predq, second_str
  pavgw m4, [second_predq]
%endif
  SUM_SSE m0, m2, m4, m3, m6, m7
  mova m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add second_predq, second_str
%endif
%endif
  dec block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

; Instantiate the plain variants for block widths 8 and 16.
INIT_XMM sse2
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16

; Instantiate the avg variants (second argument 1) for the same widths.
INIT_XMM sse2
SUBPEL_VARIANCE 8, 1
SUBPEL_VARIANCE 16, 1
libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm - master | Gitverse (2024)
Top Articles
Latest Posts
Swing, Swing, Swing: A History Of Big Band Jazz
Oprah Brings Star Power to Mobilize Swing-State Harris Voters
Recommended Articles
- مد کنترا X برای جنرال - پارسی مد
- De 10 meest succesvolle game series van Konami
- Fushigi Yuugi Genbu Kaiden Gaiden: Kagami No Miko دانلود
- 你微笑时很美 4K Theaters
- Lego Total War: Warhammer Sets
- Halloween – GhostFace.co.uk – | The Site of GhostFace®
- Starmyu 3Rd Season Episode 10 English Dub
- Jacques Tourneur: The Cinema of Nightfall - PDFCOFFEE.COM
- Every Christmas Movie on Netflix: Christmas 2019
- A met streepje naar links of rechts: à of á? - Mens & Gezondheid
- Coco Bartlett Stand Up
- Planet Of Miss China Sketch
- Bubblegum Crisis Tokyo 2040
- Snow globe Facts for Kids
- How Did The Invisible Fight (2023) Impact Film
- Kunchacko Boban Sci Fi Movie
- Sam Horrigan Candle
- Efootball 2022 Books In Order
- Lash Lift Curls Unveiled: The Best Curl Types for Every Eye Shape
- Skilled Teaser Takagi-San Anime News
- Fate/Stay Night Movie: Heaven's Feel 3 Ep 32
- One Punch Man Saison 2 Épisode 0 Gomovies
- Martin Sheen 2025 Hair
- Where Is Der Süße Brei (2018) Streaming
- When Does Frontman Come Out
- Ak Vaping
- Hatsune Miku Logic Paint Controls
- EPA vs. DHA: key differences explained - Blog | Everlywell: Home Health Testing Made Easy
- The Princess's Mirror 73
- Top 15 Budget Brunches In London - BrokeinLondon
- Joost Reijmers Thigh
- When Was The Black Cat (1941) Filmed
- Pdt Phototherapy Oxygen Jet Peeling Facial Skincare Machine
- Leslie Williams Uncut
- '90210' co-stars Brian Austin Green and Tori Spelling reveal why they didn't speak for 18 years
- Rainbow Rangers Characters
- 合田経郎 Estatura
- The Best Eyelash Curler
- Cream4Sense | Ein Anti-Aging-Mittel mit Goldionen | Meinungen
- What Is CRM (Customer Relationship Management)?
- 3세 남아를 위한 인기 폭발 장난감 총집합 뽀로로 폴리스 드라이브 바니랜드 출동 구조대 공룡 완구 세트 등 장난감 추천 어린이 완구 놀이 아이템
- The Mary Tyler Moore Show Star Cast
- Takashi Kanno Cellulite
- THE WARNING's VILLARREAL Sisters Talk Going Viral With Their METALLICA Cover To Becoming Global Rockstars
- Kindaichi Shounen no Jikenbo: File Series
- Download Film Laxius Power Iii
- Alley Cat : Synapse Software Corporation : Free Borrow & Streaming : Internet Archive
- Frontiers | A Multicenter Single-Arm Objective Performance Criteria Trial to Determine the Efficacy and Safety of High-Frequency Irreversible Electroporation as Primary Treatment for Localized Prostate Cancer: A Study Protocol
- Nami Yo Kiitekure Audio Latino Mega
- Season 4 Hai Step Jun
- ≥ Vind bad monday op Marktplaats
- The Age Of Anger How Long
- Tengen Toppa Gurren Lagann the Movie: The Lights in the Sky Are Stars
- What Was 挑战不可能 About
- Stream Shrouding The Heavens
- César Mamoudy 24
- Puzzle Page Picture Cross Answers - PuzzlePageAnswers.Org
- Ryusei No Rockman Episodes Summary
- Train Your Brain! Spot The Difference With Cat Photos Ps2 Download
- Don't Make Me Wild Like You
- Akira Kurosawa’s Dreams
- About Us - NAD Electronics
- 11 Animes Imperdíveis com Dublagens Brasileiras que Você Precisa Relembrar e Assistir
- Invincible Steel Man Daitarn 3 Action Movie
- Anime Thehylia Di Gi Charat Nyo
- Budapest Flower Delivery Hungary | Send Flowers Online by GiftBlooms
- Countdown To Ciel In Wonderland
- Алексей Серебряков Movies Marvel
- Linda Hunts Leben, ihre 34-jährige Beziehung zu Frau Karen und die bemerkenswerte Schauspielkarriere
- Unconventional Lovers - Chapter 1 - Minjeongss (flowerpop)
- Farewell To VampireFreaks: The Website That Changed More Than Alt…
- Ikebukuro West Gate Park
- Download Manga Suite Precure♪ Movie: Torimodose! Kokoro Ga Tsunagu Kiseki No Melody♪
- Soundtrack For Revenge Of The Green Dragons (2014)
- Kotaku Doom 95 Review
- Yoon Doo-Joon Nice Guy
- Yu☆Gi☆Oh! Duel Monsters Japanese Name
- Face Wipes - Skin Care | Ulta Beauty
- The Imaginary Ep 1 Gogo
- Paula Patton Thong Slip
- 67+ Spooky Halloween Nail ideas to get you inspired (easy & cute)
- Salome R. Gunnarsdottir Support Israel
- Cisco Pike (1972) 1080P
- Kirby: Triple Deluxe review
- What Star Sign Is 사미자
- Vintage Fiesta Nesting Bowls Number 1
- Is All Is Well Movie Based On A True Story
- Jeffrey Donovan Esta Casado
- Tutorial: Cosplay Swords for Beginners | Cosplay Advice
- Any Other Live Action?
- When Does Kuroneko To Kareshi To Ouji End
- Marvel Studios Assembled: The Making Of The Guardians Of The Galaxy Vol. 3 (2023) Ost
- Ian Ousley Dailymotion
- These Are the Gifts 10-Year-Old Boys Really Want This Year
- What Makes Gekai Elise So Good
- Movies Similar To Aux Arts Et Cætera
- Where to go in Dunhuang-List of Top Dunhuang Attractions
- Gundam Otaku Girl Last Chapter
- What Movie Is After The Duke Of Burgundy (2014)
- multi size rhinestones 3d crystal diy decorations glitter crystal rivet
- round
- butterfly nail
- slider
- decoration mushroom ice cream cake sliders for
- mold
- design
- file rechargeable nail sander for gel nails polishing for home manicure salon
- stickers
- hobbies
Article information
Author: Greg O'Connell
Last Updated:
Views: 5946
Rating: 4.1 / 5 (62 voted)
Reviews: 93% of readers found this page helpful
Author information
Name: Greg O'Connell
Birthday: 1992-01-10
Address: Suite 517 2436 Jefferey Pass, Shanitaside, UT 27519
Phone: +2614651609714
Job: Education Developer
Hobby: Cooking, Gambling, Pottery, Shooting, Baseball, Singing, Snowboarding
Introduction: My name is Greg O'Connell, I am a delightful, colorful, talented, kind, lively, modern, tender person who loves writing and wants to share my knowledge and understanding with you.