Changeset 286
- Timestamp: 01/14/07 15:52:33 (2 years ago)
- Files:
  - trunk/ColorConversions.c (modified) (5 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/ColorConversions.c
r285 r286
 117  117
 118  118   for (x = 0,x2 = 0,x4 =0; x < vWidth; x++, x2 += 2, x4 += 4) {
 119      - vUInt8 tmp_u = uv[x], tmp_v = vv[x], chroma = vec_mergeh(tmp_u, tmp_v), tmp_y = yv[x2], tmp_y2 = yv2[x2];
      119 + __builtin_prefetch(&yv[x+1], 0, 0); __builtin_prefetch(&yv2[x+1], 0, 0);
      120 + __builtin_prefetch(&uv[x+1], 0, 0); __builtin_prefetch(&vv[x+1], 0, 0);
      121 + vUInt8 tmp_u = vec_ldl(0, &uv[x]), tmp_v = vec_ldl(0, &vv[x]), chroma = vec_mergeh(tmp_u, tmp_v),
      122 + tmp_y = vec_ldl(0, &yv[x2]), tmp_y2 = vec_ldl(0, &yv2[x2]),
      123 + tmp_y3 = vec_ldl(16, &yv[x2]), tmp_y4 = vec_ldl(16, &yv2[x2]), chromal = vec_mergel(tmp_u, tmp_v);
      124 +
 120  125   ov[x4] = vec_mergeh(chroma, tmp_y);
      126 + ov[x4+1] = vec_mergel(chroma, tmp_y);
      127 + ov[x4+2] = vec_mergeh(chromal, tmp_y3);
      128 + ov[x4+3] = vec_mergel(chromal, tmp_y3);
 121  129   ov2[x4] = vec_mergeh(chroma, tmp_y2);
 122      - ov[x4+1] = vec_mergel(chroma, tmp_y);
 123  130   ov2[x4+1] = vec_mergel(chroma, tmp_y2);
 124      - chroma = vec_mergel(tmp_u, tmp_v);
 125      - tmp_y = yv[x2+1];
 126      - tmp_y2 = yv2[x2+1];
 127      - ov[x4+2] = vec_mergeh(chroma, tmp_y);
 128      - ov2[x4+2] = vec_mergeh(chroma, tmp_y2);
 129      - ov[x4+3] = vec_mergel(chroma, tmp_y);
 130      - ov2[x4+3] = vec_mergel(chroma, tmp_y2);
      131 + ov2[x4+2] = vec_mergeh(chromal, tmp_y4);
      132 + ov2[x4+3] = vec_mergel(chromal, tmp_y4);
 131  133   }
 132  134
   …    …
 173  175   UInt8 *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2];
 174  176   int rY = picture->linesize[0], rU = picture->linesize[1], rV = picture->linesize[2];
 175      - int y,x, x2,vWidth = width >> 4, halfheight = height >> 1;
      177 + int y,x, vWidth = width >> 4, halfheight = height >> 1;
 176  178
 177  179   for (y = 0; y < halfheight; y ++) {
   …    …
 180  182   long long *uv = (long long *)uc, *vv = (long long*)vc;
 181  183
 182      - for (x = 0 ,x2 = 0; x < vWidth; x++, x2 += 2) {
 183      - /* read one chroma row, two luma rows, write two luma rows at once.
 184      - * this avoids reading chroma twice but should we be doing strictly linear writes instead?
 185      - * fun facts: 1. sse2 supports 64-bit as well as 128-bit loads, so we do that for chroma
 186      - * 2: unrolling loops can be very bad. i think we could have done it here, but x86 is so OoO it doesn't really matter */
      184 + for (x = 0; x < vWidth; x++) {
      185 + /* read one chroma row, two luma rows, write two luma rows at once. this avoids reading chroma twice
      186 + * sse2 can do 64-bit loads, so we do that. (apple's h264 doesn't seem to, maybe we should copy them?)
      187 + * unrolling loops is very bad on x86 */
      188 + int x2 = x*2;
 187  189   __builtin_prefetch(&yv[x+1], 0, 0); __builtin_prefetch(&yv2[x+1], 0, 0); // prefetch next y vectors, throw it out of cache immediately after use
 188  190   __builtin_prefetch(&uv[x+1], 0, 0); __builtin_prefetch(&vv[x+1], 0, 0); // and chroma too
   …    …
 194  196   p3 = _mm_unpacklo_epi8(chroma, tmp_y2),
 195  197   p4 = _mm_unpackhi_epi8(chroma, tmp_y2);
 196      - ov[x2] = p1;
 197      - ov[x2+1] = p2;
 198      - ov2[x2] = p3;
 199      - ov2[x2+1] = p4;
      198 +
      199 + _mm_stream_si128(&ov[x2],p1); // store to memory rather than cache
      200 + _mm_stream_si128(&ov[x2+1],p2);
      201 + _mm_stream_si128(&ov2[x2],p3);
      202 + _mm_stream_si128(&ov2[x2+1],p4);
 200  203   }
 201  204
 202  205   if (__builtin_expect(width % 16, FALSE)) { //spill to scalar for the end if the row isn't a multiple of 16
 203  206   UInt8 *o2 = o + outRB, *yc2 = yc + rY;
 204      - for (x = vWidth * 16 , x2 = x*2; x < width; x += 2, x2 += 4) {
 205      - int hx = x >> 1;
      207 + for (x = vWidth * 16; x < width; x += 2) {
      208 + int hx = x>>1, x2 = x*2;
 206  209   o2[x2] = o[x2] = uc[hx];
 207  210   o[x2 + 1] = yc[x];
   …    …
 213  216   }
 214  217
 215      - o += outRB ; o += outRB;
 216      - yc += rY ; yc += rY;
      218 + o += outRB*2;
      219 + yc += rY*2;
 217  220   uc += rU;
 218  221   vc += rV;
 219  222   }
 220      - _mm_empty(); // leave mmx mode
      223 + _mm_sfence(); // complete all writes
 221  224   }
 222  225   #endif
