| 1 | | Index: libavcodec/i386/motion_est_mmx.c |
|---|
| 2 | | =================================================================== |
|---|
| 3 | | --- libavcodec/i386/motion_est_mmx.c (revision 10865) |
|---|
| 4 | | +++ libavcodec/i386/motion_est_mmx.c (working copy) |
|---|
| 5 | | @@ -167,7 +167,7 @@ |
|---|
| 6 | | static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
|---|
| 7 | | { |
|---|
| 8 | | asm volatile( |
|---|
| 9 | | - "movq "MANGLE(bone)", %%mm5 \n\t" |
|---|
| 10 | | + "movq %4, %%mm5 \n\t" |
|---|
| 11 | | "movq (%1), %%mm0 \n\t" |
|---|
| 12 | | "pavgb 1(%1), %%mm0 \n\t" |
|---|
| 13 | | "add %3, %1 \n\t" |
|---|
| 14 | | @@ -190,7 +190,7 @@ |
|---|
| 15 | | "sub $2, %0 \n\t" |
|---|
| 16 | | " jg 1b \n\t" |
|---|
| 17 | | : "+r" (h), "+r" (blk1), "+r" (blk2) |
|---|
| 18 | | - : "r" ((long)stride) |
|---|
| 19 | | + : "r" ((long)stride), "m" (bone) |
|---|
| 20 | | ); |
|---|
| 21 | | } |
|---|
| 22 | | |
|---|
| 23 | | @@ -258,7 +258,7 @@ |
|---|
| 24 | | "punpckhbw %%mm7, %%mm5 \n\t" |
|---|
| 25 | | "paddw %%mm4, %%mm2 \n\t" |
|---|
| 26 | | "paddw %%mm5, %%mm3 \n\t" |
|---|
| 27 | | - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" |
|---|
| 28 | | + "movq 16+%5, %%mm5 \n\t" |
|---|
| 29 | | "paddw %%mm2, %%mm0 \n\t" |
|---|
| 30 | | "paddw %%mm3, %%mm1 \n\t" |
|---|
| 31 | | "paddw %%mm5, %%mm0 \n\t" |
|---|
| 32 | | @@ -281,7 +281,7 @@ |
|---|
| 33 | | "add %4, %%"REG_a" \n\t" |
|---|
| 34 | | " js 1b \n\t" |
|---|
| 35 | | : "+a" (len) |
|---|
| 36 | | - : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride) |
|---|
| 37 | | + : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride), "m" (round_tab[0]) |
|---|
| 38 | | ); |
|---|
| 39 | | } |
|---|
| 40 | | |
|---|
| 41 | | Index: libavcodec/i386/dsputil_h264_template_mmx.c |
|---|
| 42 | | =================================================================== |
|---|
| 43 | | --- libavcodec/i386/dsputil_h264_template_mmx.c (revision 10865) |
|---|
| 44 | | +++ libavcodec/i386/dsputil_h264_template_mmx.c (working copy) |
|---|
| 45 | | @@ -188,8 +188,8 @@ |
|---|
| 46 | | "pxor %%mm7, %%mm7 \n\t" |
|---|
| 47 | | "movd %5, %%mm2 \n\t" |
|---|
| 48 | | "movd %6, %%mm3 \n\t" |
|---|
| 49 | | - "movq "MANGLE(ff_pw_8)", %%mm4\n\t" |
|---|
| 50 | | - "movq "MANGLE(ff_pw_8)", %%mm5\n\t" |
|---|
| 51 | | + "movq %7, %%mm4\n\t" |
|---|
| 52 | | + "movq %7, %%mm5\n\t" |
|---|
| 53 | | "punpcklwd %%mm2, %%mm2 \n\t" |
|---|
| 54 | | "punpcklwd %%mm3, %%mm3 \n\t" |
|---|
| 55 | | "punpcklwd %%mm2, %%mm2 \n\t" |
|---|
| 56 | | @@ -246,7 +246,7 @@ |
|---|
| 57 | | "sub $2, %2 \n\t" |
|---|
| 58 | | "jnz 1b \n\t" |
|---|
| 59 | | : "+r"(dst), "+r"(src), "+r"(h) |
|---|
| 60 | | - : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y) |
|---|
| 61 | | + : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y), "m"(ff_pw_8) |
|---|
| 62 | | ); |
|---|
| 63 | | } |
|---|
| 64 | | |
|---|
| 65 | | Index: libavcodec/i386/dsputil_mmx.c |
|---|
| 66 | | =================================================================== |
|---|
| 67 | | --- libavcodec/i386/dsputil_mmx.c (revision 10865) |
|---|
| 68 | | +++ libavcodec/i386/dsputil_mmx.c (working copy) |
|---|
| 69 | | @@ -1917,7 +1917,7 @@ |
|---|
| 70 | | |
|---|
| 71 | | #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
|---|
| 72 | | "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
|---|
| 73 | | - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
|---|
| 74 | | + "movq "#pw_20", %%mm4 \n\t" /* 20 */\ |
|---|
| 75 | | "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
|---|
| 76 | | "movq "#in7", " #m3 " \n\t" /* d */\ |
|---|
| 77 | | "movq "#in0", %%mm5 \n\t" /* D */\ |
|---|
| 78 | | @@ -1929,7 +1929,7 @@ |
|---|
| 79 | | "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ |
|---|
| 80 | | "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ |
|---|
| 81 | | "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ |
|---|
| 82 | | - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
|---|
| 83 | | + "pmullw "#pw_3", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
|---|
| 84 | | "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
|---|
| 85 | | "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
|---|
| 86 | | "psraw $5, %%mm5 \n\t"\ |
|---|
| 87 | | @@ -1963,10 +1963,10 @@ |
|---|
| 88 | | "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|---|
| 89 | | "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|---|
| 90 | | "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|---|
| 91 | | - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| 92 | | + "pmullw %8, %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| 93 | | "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|---|
| 94 | | "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|---|
| 95 | | - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|---|
| 96 | | + "pmullw %7, %%mm0 \n\t" /* 20a */\ |
|---|
| 97 | | "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|---|
| 98 | | "paddw %6, %%mm6 \n\t"\ |
|---|
| 99 | | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| 100 | | @@ -1989,10 +1989,10 @@ |
|---|
| 101 | | "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ |
|---|
| 102 | | "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ |
|---|
| 103 | | "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ |
|---|
| 104 | | - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| 105 | | + "pmullw %8, %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| 106 | | "paddw %%mm2, %%mm1 \n\t" /* a */\ |
|---|
| 107 | | "paddw %%mm6, %%mm4 \n\t" /* d */\ |
|---|
| 108 | | - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|---|
| 109 | | + "pmullw %7, %%mm1 \n\t" /* 20a */\ |
|---|
| 110 | | "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
|---|
| 111 | | "paddw %6, %%mm1 \n\t"\ |
|---|
| 112 | | "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
|---|
| 113 | | @@ -2015,7 +2015,7 @@ |
|---|
| 114 | | "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ |
|---|
| 115 | | "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ |
|---|
| 116 | | "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ |
|---|
| 117 | | - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
|---|
| 118 | | + "pmullw %8, %%mm0 \n\t" /* 3c - 6b */\ |
|---|
| 119 | | "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
|---|
| 120 | | "paddw %%mm3, %%mm2 \n\t" /* d */\ |
|---|
| 121 | | "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ |
|---|
| 122 | | @@ -2023,7 +2023,7 @@ |
|---|
| 123 | | "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ |
|---|
| 124 | | "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ |
|---|
| 125 | | "paddw %%mm2, %%mm6 \n\t" /* a */\ |
|---|
| 126 | | - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
|---|
| 127 | | + "pmullw %7, %%mm6 \n\t" /* 20a */\ |
|---|
| 128 | | "paddw %6, %%mm0 \n\t"\ |
|---|
| 129 | | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| 130 | | "psraw $5, %%mm0 \n\t"\ |
|---|
| 131 | | @@ -2038,8 +2038,8 @@ |
|---|
| 132 | | "paddw %%mm2, %%mm5 \n\t" /* d */\ |
|---|
| 133 | | "paddw %%mm6, %%mm6 \n\t" /* 2b */\ |
|---|
| 134 | | "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ |
|---|
| 135 | | - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
|---|
| 136 | | - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ |
|---|
| 137 | | + "pmullw %7, %%mm3 \n\t" /* 20a */\ |
|---|
| 138 | | + "pmullw %8, %%mm4 \n\t" /* 3c - 6b */\ |
|---|
| 139 | | "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
|---|
| 140 | | "paddw %6, %%mm4 \n\t"\ |
|---|
| 141 | | "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| 142 | | @@ -2052,7 +2052,9 @@ |
|---|
| 143 | | "decl %2 \n\t"\ |
|---|
| 144 | | " jnz 1b \n\t"\ |
|---|
| 145 | | : "+a"(src), "+c"(dst), "+m"(h)\ |
|---|
| 146 | | - : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
|---|
| 147 | | + : "d"((long)srcStride), "S"((long)dstStride),\ |
|---|
| 148 | | + "m"(temp), "m"(ROUNDER),\ |
|---|
| 149 | | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| 150 | | : "memory"\ |
|---|
| 151 | | );\ |
|---|
| 152 | | }\ |
|---|
| 153 | | @@ -2130,10 +2132,10 @@ |
|---|
| 154 | | "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|---|
| 155 | | "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|---|
| 156 | | "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|---|
| 157 | | - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| 158 | | + "pmullw %8, %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| 159 | | "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|---|
| 160 | | "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|---|
| 161 | | - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|---|
| 162 | | + "pmullw %7, %%mm0 \n\t" /* 20a */\ |
|---|
| 163 | | "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|---|
| 164 | | "paddw %6, %%mm6 \n\t"\ |
|---|
| 165 | | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| 166 | | @@ -2151,8 +2153,8 @@ |
|---|
| 167 | | "paddw %%mm5, %%mm4 \n\t" /* d */\ |
|---|
| 168 | | "paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
|---|
| 169 | | "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
|---|
| 170 | | - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|---|
| 171 | | - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| 172 | | + "pmullw %7, %%mm1 \n\t" /* 20a */\ |
|---|
| 173 | | + "pmullw %8, %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| 174 | | "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
|---|
| 175 | | "paddw %6, %%mm1 \n\t"\ |
|---|
| 176 | | "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| 177 | | @@ -2165,7 +2167,9 @@ |
|---|
| 178 | | "decl %2 \n\t"\ |
|---|
| 179 | | " jnz 1b \n\t"\ |
|---|
| 180 | | : "+a"(src), "+c"(dst), "+m"(h)\ |
|---|
| 181 | | - : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
|---|
| 182 | | + : "S"((long)srcStride), "D"((long)dstStride),\ |
|---|
| 183 | | + "m"(temp), "m"(ROUNDER),\ |
|---|
| 184 | | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| 185 | | : "memory"\ |
|---|
| 186 | | );\ |
|---|
| 187 | | }\ |
|---|
| 188 | | @@ -2244,31 +2248,31 @@ |
|---|
| 189 | | "movq 8(%0), %%mm1 \n\t"\ |
|---|
| 190 | | "movq 16(%0), %%mm2 \n\t"\ |
|---|
| 191 | | "movq 24(%0), %%mm3 \n\t"\ |
|---|
| 192 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| 193 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| 194 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| 195 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| 196 | | "add %4, %1 \n\t"\ |
|---|
| 197 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| 198 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| 199 | | \ |
|---|
| 200 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| 201 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| 202 | | "add %4, %1 \n\t"\ |
|---|
| 203 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| 204 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
|---|
| 205 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| 206 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
|---|
| 207 | | "add %4, %1 \n\t"\ |
|---|
| 208 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
|---|
| 209 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
|---|
| 210 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
|---|
| 211 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
|---|
| 212 | | "add %4, %1 \n\t"\ |
|---|
| 213 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
|---|
| 214 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
|---|
| 215 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
|---|
| 216 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
|---|
| 217 | | "add %4, %1 \n\t"\ |
|---|
| 218 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
|---|
| 219 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
|---|
| 220 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
|---|
| 221 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
|---|
| 222 | | "add %4, %1 \n\t"\ |
|---|
| 223 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
|---|
| 224 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
|---|
| 225 | | \ |
|---|
| 226 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
|---|
| 227 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
|---|
| 228 | | "add %4, %1 \n\t" \ |
|---|
| 229 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
|---|
| 230 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
|---|
| 231 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
|---|
| 232 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
|---|
| 233 | | \ |
|---|
| 234 | | "add $136, %0 \n\t"\ |
|---|
| 235 | | "add %6, %1 \n\t"\ |
|---|
| 236 | | @@ -2276,7 +2280,9 @@ |
|---|
| 237 | | " jnz 1b \n\t"\ |
|---|
| 238 | | \ |
|---|
| 239 | | : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
|---|
| 240 | | - : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
|---|
| 241 | | + : "r"((long)dstStride), "r"(2*(long)dstStride),\ |
|---|
| 242 | | + "m"(ROUNDER), "g"(4-14*(long)dstStride),\ |
|---|
| 243 | | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| 244 | | :"memory"\ |
|---|
| 245 | | );\ |
|---|
| 246 | | }\ |
|---|
| 247 | | @@ -2316,19 +2322,19 @@ |
|---|
| 248 | | "movq 8(%0), %%mm1 \n\t"\ |
|---|
| 249 | | "movq 16(%0), %%mm2 \n\t"\ |
|---|
| 250 | | "movq 24(%0), %%mm3 \n\t"\ |
|---|
| 251 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| 252 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| 253 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| 254 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| 255 | | "add %4, %1 \n\t"\ |
|---|
| 256 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| 257 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| 258 | | \ |
|---|
| 259 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| 260 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| 261 | | "add %4, %1 \n\t"\ |
|---|
| 262 | | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| 263 | | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| 264 | | \ |
|---|
| 265 | | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
|---|
| 266 | | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
|---|
| 267 | | "add %4, %1 \n\t"\ |
|---|
| 268 | | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
|---|
| 269 | | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
|---|
| 270 | | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
|---|
| 271 | | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
|---|
| 272 | | \ |
|---|
| 273 | | "add $72, %0 \n\t"\ |
|---|
| 274 | | "add %6, %1 \n\t"\ |
|---|
| 275 | | @@ -2336,7 +2342,9 @@ |
|---|
| 276 | | " jnz 1b \n\t"\ |
|---|
| 277 | | \ |
|---|
| 278 | | : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
|---|
| 279 | | - : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
|---|
| 280 | | + : "r"((long)dstStride), "r"(2*(long)dstStride),\ |
|---|
| 281 | | + "m"(ROUNDER), "g"(4-6*(long)dstStride),\ |
|---|
| 282 | | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| 283 | | : "memory"\ |
|---|
| 284 | | );\ |
|---|
| 285 | | }\ |
|---|
| 286 | | @@ -2967,7 +2975,6 @@ |
|---|
| 287 | | double c = 2.0 / (len-1.0); |
|---|
| 288 | | int n2 = len>>1; |
|---|
| 289 | | long i = -n2*sizeof(int32_t); |
|---|
| 290 | | - long j = n2*sizeof(int32_t); |
|---|
| 291 | | asm volatile( |
|---|
| 292 | | "movsd %0, %%xmm7 \n\t" |
|---|
| 293 | | "movapd %1, %%xmm6 \n\t" |
|---|
| 294 | | @@ -2985,17 +2992,18 @@ |
|---|
| 295 | | "movapd %%xmm6, %%xmm0 \n\t"\ |
|---|
| 296 | | "subpd %%xmm1, %%xmm0 \n\t"\ |
|---|
| 297 | | "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ |
|---|
| 298 | | - "cvtpi2pd (%4,%0), %%xmm2 \n\t"\ |
|---|
| 299 | | - "cvtpi2pd (%5,%1), %%xmm3 \n\t"\ |
|---|
| 300 | | + "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ |
|---|
| 301 | | "mulpd %%xmm0, %%xmm2 \n\t"\ |
|---|
| 302 | | + "movapd %%xmm2, (%1,%0,2) \n\t"\ |
|---|
| 303 | | + "negl %0\n\t"\ |
|---|
| 304 | | + "cvtpi2pd (%4,%0), %%xmm3 \n\t"\ |
|---|
| 305 | | "mulpd %%xmm1, %%xmm3 \n\t"\ |
|---|
| 306 | | - "movapd %%xmm2, (%2,%0,2) \n\t"\ |
|---|
| 307 | | - MOVPD" %%xmm3, (%3,%1,2) \n\t"\ |
|---|
| 308 | | + MOVPD" %%xmm3, (%2,%0,2) \n\t"\ |
|---|
| 309 | | "subpd %%xmm5, %%xmm7 \n\t"\ |
|---|
| 310 | | - "sub $8, %1 \n\t"\ |
|---|
| 311 | | + "negl %0\n\t"\ |
|---|
| 312 | | "add $8, %0 \n\t"\ |
|---|
| 313 | | "jl 1b \n\t"\ |
|---|
| 314 | | - :"+&r"(i), "+&r"(j)\ |
|---|
| 315 | | + :"+&r"(i)\ |
|---|
| 316 | | :"r"(w_data+n2), "r"(w_data+len-2-n2),\ |
|---|
| 317 | | "r"(data+n2), "r"(data+len-2-n2)\ |
|---|
| 318 | | ); |
|---|
| 319 | | Index: libavcodec/i386/h264dsp_mmx.c |
|---|
| 320 | | =================================================================== |
|---|
| 321 | | --- libavcodec/i386/h264dsp_mmx.c (revision 10865) |
|---|
| 322 | | +++ libavcodec/i386/h264dsp_mmx.c (working copy) |
|---|
| 323 | | @@ -341,21 +341,21 @@ |
|---|
| 324 | | // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) |
|---|
| 325 | | // out: mm1=p0' mm2=q0' |
|---|
| 326 | | // clobbers: mm0,3-6 |
|---|
| 327 | | -#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ |
|---|
| 328 | | +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3, pb_a1)\ |
|---|
| 329 | | "movq %%mm1 , %%mm5 \n\t"\ |
|---|
| 330 | | "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ |
|---|
| 331 | | "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ |
|---|
| 332 | | "pcmpeqb %%mm4 , %%mm4 \n\t"\ |
|---|
| 333 | | "pxor %%mm4 , %%mm3 \n\t"\ |
|---|
| 334 | | "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ |
|---|
| 335 | | - "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ |
|---|
| 336 | | + "pavgb "#pb_3" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ |
|---|
| 337 | | "pxor %%mm1 , %%mm4 \n\t"\ |
|---|
| 338 | | "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ |
|---|
| 339 | | "pavgb %%mm5 , %%mm3 \n\t"\ |
|---|
| 340 | | "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ |
|---|
| 341 | | - "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ |
|---|
| 342 | | + "movq "#pb_a1" , %%mm6 \n\t"\ |
|---|
| 343 | | "psubusb %%mm3 , %%mm6 \n\t"\ |
|---|
| 344 | | - "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ |
|---|
| 345 | | + "psubusb "#pb_a1" , %%mm3 \n\t"\ |
|---|
| 346 | | "pminub %%mm7 , %%mm6 \n\t"\ |
|---|
| 347 | | "pminub %%mm7 , %%mm3 \n\t"\ |
|---|
| 348 | | "psubusb %%mm6 , %%mm1 \n\t"\ |
|---|
| 349 | | @@ -422,14 +422,14 @@ |
|---|
| 350 | | H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) |
|---|
| 351 | | |
|---|
| 352 | | /* filter p0, q0 */ |
|---|
| 353 | | - H264_DEBLOCK_P0_Q0(%8, unused) |
|---|
| 354 | | + H264_DEBLOCK_P0_Q0(%8, %9, %10) |
|---|
| 355 | | "movq %%mm1, (%1,%3,2) \n\t" |
|---|
| 356 | | "movq %%mm2, (%2) \n\t" |
|---|
| 357 | | |
|---|
| 358 | | : "=m"(*tmp0) |
|---|
| 359 | | : "r"(pix-3*stride), "r"(pix), "r"((long)stride), |
|---|
| 360 | | "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), |
|---|
| 361 | | - "m"(mm_bone) |
|---|
| 362 | | + "m"(mm_bone), "m" (ff_pb_3), "m" (ff_pb_A1) |
|---|
| 363 | | ); |
|---|
| 364 | | } |
|---|
| 365 | | |
|---|
| 366 | | @@ -470,13 +470,13 @@ |
|---|
| 367 | | "movd %3, %%mm6 \n\t" |
|---|
| 368 | | "punpcklbw %%mm6, %%mm6 \n\t" |
|---|
| 369 | | "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask |
|---|
| 370 | | - H264_DEBLOCK_P0_Q0(%6, %7) |
|---|
| 371 | | + H264_DEBLOCK_P0_Q0(%6, %7, %8) |
|---|
| 372 | | "movq %%mm1, (%0,%2) \n\t" |
|---|
| 373 | | "movq %%mm2, (%1) \n\t" |
|---|
| 374 | | |
|---|
| 375 | | :: "r"(pix-2*stride), "r"(pix), "r"((long)stride), |
|---|
| 376 | | "r"(*(uint32_t*)tc0), |
|---|
| 377 | | - "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F) |
|---|
| 378 | | + "m"(alpha1), "m"(beta1), "m"(mm_bone), "m" (ff_pb_3), "m" (ff_pb_A1) |
|---|
| 379 | | ); |
|---|
| 380 | | } |
|---|
| 381 | | |
|---|
| 382 | | Index: libavcodec/i386/simple_idct_mmx.c |
|---|
| 383 | | =================================================================== |
|---|
| 384 | | --- libavcodec/i386/simple_idct_mmx.c (revision 10865) |
|---|
| 385 | | +++ libavcodec/i386/simple_idct_mmx.c (working copy) |
|---|
| 386 | | @@ -363,7 +363,7 @@ |
|---|
| 387 | | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
|---|
| 388 | | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
|---|
| 389 | | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
|---|
| 390 | | - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
|---|
| 391 | | + "movq %3, %%mm4 \n\t"\ |
|---|
| 392 | | "pand %%mm0, %%mm4 \n\t"\ |
|---|
| 393 | | "por %%mm1, %%mm4 \n\t"\ |
|---|
| 394 | | "por %%mm2, %%mm4 \n\t"\ |
|---|
| 395 | | @@ -437,7 +437,7 @@ |
|---|
| 396 | | "jmp 2f \n\t"\ |
|---|
| 397 | | "1: \n\t"\ |
|---|
| 398 | | "pslld $16, %%mm0 \n\t"\ |
|---|
| 399 | | - "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
|---|
| 400 | | + "#paddd %4, %%mm0 \n\t"\ |
|---|
| 401 | | "psrad $13, %%mm0 \n\t"\ |
|---|
| 402 | | "packssdw %%mm0, %%mm0 \n\t"\ |
|---|
| 403 | | "movq %%mm0, " #dst " \n\t"\ |
|---|
| 404 | | @@ -471,7 +471,7 @@ |
|---|
| 405 | | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
|---|
| 406 | | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
|---|
| 407 | | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
|---|
| 408 | | - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
|---|
| 409 | | + "movq %3, %%mm4 \n\t"\ |
|---|
| 410 | | "pand %%mm0, %%mm4 \n\t"\ |
|---|
| 411 | | "por %%mm1, %%mm4 \n\t"\ |
|---|
| 412 | | "por %%mm2, %%mm4 \n\t"\ |
|---|
| 413 | | @@ -545,7 +545,7 @@ |
|---|
| 414 | | "jmp 2f \n\t"\ |
|---|
| 415 | | "1: \n\t"\ |
|---|
| 416 | | "pslld $16, %%mm0 \n\t"\ |
|---|
| 417 | | - "paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
|---|
| 418 | | + "paddd %4, %%mm0 \n\t"\ |
|---|
| 419 | | "psrad $13, %%mm0 \n\t"\ |
|---|
| 420 | | "packssdw %%mm0, %%mm0 \n\t"\ |
|---|
| 421 | | "movq %%mm0, " #dst " \n\t"\ |
|---|
| 422 | | @@ -1270,7 +1270,7 @@ |
|---|
| 423 | | */ |
|---|
| 424 | | |
|---|
| 425 | | "9: \n\t" |
|---|
| 426 | | - :: "r" (block), "r" (temp), "r" (coeffs) |
|---|
| 427 | | + :: "r" (block), "r" (temp), "r" (coeffs), "m" (wm1010), "m"(d40000) |
|---|
| 428 | | : "%eax" |
|---|
| 429 | | ); |
|---|
| 430 | | } |
|---|
| 431 | | |
|---|
| 432 | | Index: libavcodec/dsputil.c |
|---|
| 433 | | =================================================================== |
|---|
| 434 | | --- libavcodec/dsputil.c (revision 11192) |
|---|
| 435 | | +++ libavcodec/dsputil.c (revision 11290) |
|---|
| | 1 | diff -ru ffmpeg/libavcodec/dsputil.c ffmpeg-/libavcodec/dsputil.c |
|---|
| | 2 | --- ffmpeg/libavcodec/dsputil.c 2008-01-20 15:05:32.000000000 -0500 |
|---|
| | 3 | +++ ffmpeg-/libavcodec/dsputil.c 2008-01-20 17:09:12.000000000 -0500 |
|---|
| | 94 | diff -ru ffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c ffmpeg-/libavcodec/i386/dsputil_h264_template_mmx.c |
|---|
| | 95 | --- ffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c 2008-01-20 15:05:31.000000000 -0500 |
|---|
| | 96 | +++ ffmpeg-/libavcodec/i386/dsputil_h264_template_mmx.c 2008-01-20 17:08:31.000000000 -0500 |
|---|
| | 97 | @@ -182,14 +182,26 @@ |
|---|
| | 98 | } |
|---|
| | 99 | } |
|---|
| | 100 | |
|---|
| | 101 | +extern void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); |
|---|
| | 102 | +extern void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); |
|---|
| | 103 | + |
|---|
| | 104 | static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) |
|---|
| | 105 | { |
|---|
| | 106 | + if (!(x*y)) { |
|---|
| | 107 | +#if H264_CHROMA_MC4_TMPL == put_h264_chroma_mc4_mmx |
|---|
| | 108 | + put_h264_chroma_mc4_c(dst,src,stride,h,x,y); |
|---|
| | 109 | +#else |
|---|
| | 110 | + avg_h264_chroma_mc4_c(dst,src,stride,h,x,y); |
|---|
| | 111 | +#endif |
|---|
| | 112 | + return; |
|---|
| | 113 | + } |
|---|
| | 114 | + |
|---|
| | 115 | asm volatile( |
|---|
| | 116 | "pxor %%mm7, %%mm7 \n\t" |
|---|
| | 117 | "movd %5, %%mm2 \n\t" |
|---|
| | 118 | "movd %6, %%mm3 \n\t" |
|---|
| | 119 | - "movq "MANGLE(ff_pw_8)", %%mm4\n\t" |
|---|
| | 120 | - "movq "MANGLE(ff_pw_8)", %%mm5\n\t" |
|---|
| | 121 | + "movq %7, %%mm4\n\t" |
|---|
| | 122 | + "movq %7, %%mm5\n\t" |
|---|
| | 123 | "punpcklwd %%mm2, %%mm2 \n\t" |
|---|
| | 124 | "punpcklwd %%mm3, %%mm3 \n\t" |
|---|
| | 125 | "punpcklwd %%mm2, %%mm2 \n\t" |
|---|
| | 126 | @@ -246,7 +258,7 @@ |
|---|
| | 127 | "sub $2, %2 \n\t" |
|---|
| | 128 | "jnz 1b \n\t" |
|---|
| | 129 | : "+r"(dst), "+r"(src), "+r"(h) |
|---|
| | 130 | - : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y) |
|---|
| | 131 | + : "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y), "m"(ff_pw_8) |
|---|
| | 132 | ); |
|---|
| | 133 | } |
|---|
| | 134 | |
|---|
| | 135 | diff -ru ffmpeg/libavcodec/i386/dsputil_mmx.c ffmpeg-/libavcodec/i386/dsputil_mmx.c |
|---|
| | 136 | --- ffmpeg/libavcodec/i386/dsputil_mmx.c 2008-01-20 15:05:31.000000000 -0500 |
|---|
| | 137 | +++ ffmpeg-/libavcodec/i386/dsputil_mmx.c 2008-01-20 17:03:15.000000000 -0500 |
|---|
| | 138 | @@ -1918,7 +1918,7 @@ |
|---|
| | 139 | |
|---|
| | 140 | #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
|---|
| | 141 | "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
|---|
| | 142 | - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
|---|
| | 143 | + "movq "#pw_20", %%mm4 \n\t" /* 20 */\ |
|---|
| | 144 | "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
|---|
| | 145 | "movq "#in7", " #m3 " \n\t" /* d */\ |
|---|
| | 146 | "movq "#in0", %%mm5 \n\t" /* D */\ |
|---|
| | 147 | @@ -1930,7 +1930,7 @@ |
|---|
| | 148 | "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ |
|---|
| | 149 | "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ |
|---|
| | 150 | "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ |
|---|
| | 151 | - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
|---|
| | 152 | + "pmullw "#pw_3", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
|---|
| | 153 | "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
|---|
| | 154 | "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
|---|
| | 155 | "psraw $5, %%mm5 \n\t"\ |
|---|
| | 156 | @@ -1964,10 +1964,10 @@ |
|---|
| | 157 | "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|---|
| | 158 | "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|---|
| | 159 | "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|---|
| | 160 | - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| | 161 | + "pmullw %8, %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| | 162 | "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|---|
| | 163 | "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|---|
| | 164 | - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|---|
| | 165 | + "pmullw %7, %%mm0 \n\t" /* 20a */\ |
|---|
| | 166 | "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|---|
| | 167 | "paddw %6, %%mm6 \n\t"\ |
|---|
| | 168 | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| | 169 | @@ -1990,10 +1990,10 @@ |
|---|
| | 170 | "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ |
|---|
| | 171 | "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ |
|---|
| | 172 | "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ |
|---|
| | 173 | - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| | 174 | + "pmullw %8, %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| | 175 | "paddw %%mm2, %%mm1 \n\t" /* a */\ |
|---|
| | 176 | "paddw %%mm6, %%mm4 \n\t" /* d */\ |
|---|
| | 177 | - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|---|
| | 178 | + "pmullw %7, %%mm1 \n\t" /* 20a */\ |
|---|
| | 179 | "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
|---|
| | 180 | "paddw %6, %%mm1 \n\t"\ |
|---|
| | 181 | "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
|---|
| | 182 | @@ -2016,7 +2016,7 @@ |
|---|
| | 183 | "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ |
|---|
| | 184 | "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ |
|---|
| | 185 | "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ |
|---|
| | 186 | - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
|---|
| | 187 | + "pmullw %8, %%mm0 \n\t" /* 3c - 6b */\ |
|---|
| | 188 | "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
|---|
| | 189 | "paddw %%mm3, %%mm2 \n\t" /* d */\ |
|---|
| | 190 | "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ |
|---|
| | 191 | @@ -2024,7 +2024,7 @@ |
|---|
| | 192 | "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ |
|---|
| | 193 | "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ |
|---|
| | 194 | "paddw %%mm2, %%mm6 \n\t" /* a */\ |
|---|
| | 195 | - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
|---|
| | 196 | + "pmullw %7, %%mm6 \n\t" /* 20a */\ |
|---|
| | 197 | "paddw %6, %%mm0 \n\t"\ |
|---|
| | 198 | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| | 199 | "psraw $5, %%mm0 \n\t"\ |
|---|
| | 200 | @@ -2039,8 +2039,8 @@ |
|---|
| | 201 | "paddw %%mm2, %%mm5 \n\t" /* d */\ |
|---|
| | 202 | "paddw %%mm6, %%mm6 \n\t" /* 2b */\ |
|---|
| | 203 | "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ |
|---|
| | 204 | - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
|---|
| | 205 | - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ |
|---|
| | 206 | + "pmullw %7, %%mm3 \n\t" /* 20a */\ |
|---|
| | 207 | + "pmullw %8, %%mm4 \n\t" /* 3c - 6b */\ |
|---|
| | 208 | "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
|---|
| | 209 | "paddw %6, %%mm4 \n\t"\ |
|---|
| | 210 | "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| | 211 | @@ -2053,7 +2053,9 @@ |
|---|
| | 212 | "decl %2 \n\t"\ |
|---|
| | 213 | " jnz 1b \n\t"\ |
|---|
| | 214 | : "+a"(src), "+c"(dst), "+m"(h)\ |
|---|
| | 215 | - : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
|---|
| | 216 | + : "d"((long)srcStride), "S"((long)dstStride),\ |
|---|
| | 217 | + "m"(temp), "m"(ROUNDER),\ |
|---|
| | 218 | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| | 219 | : "memory"\ |
|---|
| | 220 | );\ |
|---|
| | 221 | }\ |
|---|
| | 222 | @@ -2131,10 +2133,10 @@ |
|---|
| | 223 | "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|---|
| | 224 | "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|---|
| | 225 | "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|---|
| | 226 | - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| | 227 | + "pmullw %8, %%mm6 \n\t" /* 3c - 6b */\ |
|---|
| | 228 | "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|---|
| | 229 | "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|---|
| | 230 | - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|---|
| | 231 | + "pmullw %7, %%mm0 \n\t" /* 20a */\ |
|---|
| | 232 | "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|---|
| | 233 | "paddw %6, %%mm6 \n\t"\ |
|---|
| | 234 | "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| | 235 | @@ -2152,8 +2154,8 @@ |
|---|
| | 236 | "paddw %%mm5, %%mm4 \n\t" /* d */\ |
|---|
| | 237 | "paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
|---|
| | 238 | "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
|---|
| | 239 | - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|---|
| | 240 | - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| | 241 | + "pmullw %7, %%mm1 \n\t" /* 20a */\ |
|---|
| | 242 | + "pmullw %8, %%mm3 \n\t" /* 3c - 6b */\ |
|---|
| | 243 | "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
|---|
| | 244 | "paddw %6, %%mm1 \n\t"\ |
|---|
| | 245 | "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
|---|
| | 246 | @@ -2166,7 +2168,9 @@ |
|---|
| | 247 | "decl %2 \n\t"\ |
|---|
| | 248 | " jnz 1b \n\t"\ |
|---|
| | 249 | : "+a"(src), "+c"(dst), "+m"(h)\ |
|---|
| | 250 | - : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
|---|
| | 251 | + : "S"((long)srcStride), "D"((long)dstStride),\ |
|---|
| | 252 | + "m"(temp), "m"(ROUNDER),\ |
|---|
| | 253 | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| | 254 | : "memory"\ |
|---|
| | 255 | );\ |
|---|
| | 256 | }\ |
|---|
| | 257 | @@ -2245,31 +2249,31 @@ |
|---|
| | 258 | "movq 8(%0), %%mm1 \n\t"\ |
|---|
| | 259 | "movq 16(%0), %%mm2 \n\t"\ |
|---|
| | 260 | "movq 24(%0), %%mm3 \n\t"\ |
|---|
| | 261 | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| | 262 | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| | 263 | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| | 264 | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| | 265 | "add %4, %1 \n\t"\ |
|---|
| | 266 | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| | 267 | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| | 268 | \ |
|---|
| | 269 | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| | 270 | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| | 271 | "add %4, %1 \n\t"\ |
|---|
| | 272 | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| | 273 | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
|---|
| | 274 | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|---|
| | 275 | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
|---|
| | 276 | "add %4, %1 \n\t"\ |
|---|
| | 277 | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
|---|
| | 278 | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
|---|
| | 279 | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
|---|
| | 280 | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
|---|
| | 281 | "add %4, %1 \n\t"\ |
|---|
| | 282 | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
|---|
| | 283 | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
|---|
| | 284 | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
|---|
| | 285 | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
|---|
| | 286 | "add %4, %1 \n\t"\ |
|---|
| | 287 | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
|---|
| | 288 | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
|---|
| | 289 | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
|---|
| | 290 | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
|---|
| | 291 | "add %4, %1 \n\t"\ |
|---|
| | 292 | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
|---|
| | 293 | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
|---|
| | 294 | \ |
|---|
| | 295 | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
|---|
| | 296 | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
|---|
| | 297 | "add %4, %1 \n\t" \ |
|---|
| | 298 | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
|---|
| | 299 | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
|---|
| | 300 | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
|---|
| | 301 | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
|---|
| | 302 | \ |
|---|
| | 303 | "add $136, %0 \n\t"\ |
|---|
| | 304 | "add %6, %1 \n\t"\ |
|---|
| | 305 | @@ -2277,7 +2281,9 @@ |
|---|
| | 306 | " jnz 1b \n\t"\ |
|---|
| | 307 | \ |
|---|
| | 308 | : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
|---|
| | 309 | - : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
|---|
| | 310 | + : "r"((long)dstStride), "r"(2*(long)dstStride),\ |
|---|
| | 311 | + "m"(ROUNDER), "g"(4-14*(long)dstStride),\ |
|---|
| | 312 | + "m"(ff_pw_20), "m"(ff_pw_3)\ |
|---|
| | 313 | :"memory"\ |
|---|
| | 314 | );\ |
|---|
| | 315 | }\ |
|---|
| | 316 | @@ -2317,19 +2323,19 @@ |
|---|
| | 317 | "movq 8(%0), %%mm1 \n\t"\ |
|---|
| | 318 | "movq 16(%0), %%mm2 \n\t"\ |
|---|
| | 319 | "movq 24(%0), %%mm3 \n\t"\ |
|---|
| | 320 | - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| | 321 | - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| | 322 | + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|---|
| | 323 | + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|---|
| | 324 | "add %4, %1 \n\t"\ |
|---|
| | 325 | - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| | 326 | + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|---|
| | 327 | \ |
|---|
| | 328 | - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|
| | 329 | + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|---|