root/branches/perian-1.0/ColorConversions.c

Revision 471, 12.1 kB (checked in by astrange, 2 years ago)

Add \alpha tag to SSA, and support "\\" typos.
Some -descriptions for debugging.
Make BGR24toRGB24() more like the other conversions.

Line 
1 /*
2  *  ColorConversions.h
3  *  Perian
4  *
5  *  Created by Alexander Strange on 1/10/07.
6  *  This library is free software; you can redistribute it and/or
7  *  modify it under the terms of the GNU Lesser General Public
8  *  License as published by the Free Software Foundation; either
9  *  version 2.1 of the License, or (at your option) any later version.
10  *
11  *  This library is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  *  Lesser General Public License for more details.
15  *
16  *  You should have received a copy of the GNU Lesser General Public
17  *  License along with this library; if not, write to the Free Software
18  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA */
19
20 #include "ColorConversions.h"
21 #include <QuickTime/QuickTime.h>
22 #include <Accelerate/Accelerate.h>
23 #include <sys/types.h>
24 #include <sys/sysctl.h>
25
26 //-----------------------------------------------------------------
27 // FastY420
28 //-----------------------------------------------------------------
29 // Returns y420 data directly to QuickTime which then converts
30 // in RGB for display
31 //-----------------------------------------------------------------
32
33 void FastY420(UInt8 *baseAddr, AVFrame *picture)
34 {
35     PlanarPixmapInfoYUV420 *planar;
36        
37         /*From Docs: PixMap baseAddr points to a big-endian PlanarPixmapInfoYUV420 struct; see ImageCodec.i. */
38     planar = (PlanarPixmapInfoYUV420 *) baseAddr;
39    
40     // if ya can't set da poiners, set da offsets
41     planar->componentInfoY.offset = EndianU32_NtoB(picture->data[0] - baseAddr);
42     planar->componentInfoCb.offset =  EndianU32_NtoB(picture->data[1] - baseAddr);
43     planar->componentInfoCr.offset =  EndianU32_NtoB(picture->data[2] - baseAddr);
44    
45     // for the 16/32 add look at EDGE in mpegvideo.c
46     planar->componentInfoY.rowBytes = EndianU32_NtoB(picture->linesize[0]);
47     planar->componentInfoCb.rowBytes = EndianU32_NtoB(picture->linesize[1]);
48     planar->componentInfoCr.rowBytes = EndianU32_NtoB(picture->linesize[2]);
49 }
50
51 //-----------------------------------------------------------------
52 // FFusionSlowDecompress
53 //-----------------------------------------------------------------
54 // We have to return 2yuv values because
55 // QT version has no built-in y420 component.
56 // Since we do the conversion ourselves it is not really optimized....
57 // The function should never be called since many people now
58 // have a decent OS/QT version.
59 //-----------------------------------------------------------------
60
61 #ifdef __BIG_ENDIAN__
62 //hand-unrolled code is a bad idea on modern CPUs. luckily, this does not run on modern CPUs, only G3s.
63 //also, big-endian only
64
65 static void Y420toY422_ppc_scalar(UInt8* baseAddr, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
66 {
67          unsigned             y = height / 2;
68          unsigned             halfWidth = width / 2, halfHalfWidth = width / 4;
69          UInt8          *inY = picture->data[0], *inU = picture->data[1], *inV = picture->data[2];
70          int             rB = picture->linesize[0], rbU = picture->linesize[1], rbV = picture->linesize[2];
71          
72          while (y--) {
73                  UInt32         *ldst = (UInt32 *) baseAddr, *ldstr2 = (UInt32 *) (baseAddr + outRB);
74                  UInt32         *lsrc = (UInt32 *) inY, *lsrcr2 = (UInt32 *) (inY + rB);
75                  UInt16         *sU = (UInt16 *) inU, *sV = (UInt16 *) inV;
76                  ptrdiff_t              off;
77                  
78                  for (off = 0; off < halfHalfWidth; off++) {
79                          UInt16          chrU = sU[off], chrV = sV[off];
80                          UInt32          row1luma = lsrc[off], row2luma = lsrcr2[off];
81                          UInt32          chromas1 = (chrU & 0xff00) << 16 | (chrV & 0xff00), chromas2 = (chrU & 0xff) << 24 | (chrV & 0xff) << 8;
82                          int             off2 = off * 2;
83                          
84                          ldst[off2] = chromas1 | (row1luma & 0xff000000) >> 8 | (row1luma & 0xff0000) >> 16;
85                          ldstr2[off2] = chromas1 | (row2luma & 0xff000000) >> 8 | (row2luma & 0xff0000) >> 16;
86                          off2++;
87                          ldst[off2] = chromas2 | (row1luma & 0xff00) << 8 | row1luma & 0xff;
88                          ldstr2[off2] = chromas2 | (row2luma & 0xff00) << 8 | row2luma & 0xff;
89                  }
90                  
91                  if (halfWidth % 4) {
92                          UInt16         *ssrc = (UInt16 *) inY, *ssrcr2 = (UInt16 *) (inY + rB);
93                          
94                          ptrdiff_t       off = halfWidth - 2;
95                          UInt32          chromas = inV[off] << 8 | (inU[off] << 24);
96                          UInt16          row1luma = ssrc[off], row2luma = ssrcr2[off];
97                          
98                          ldst[off] = chromas | row1luma & 0xff | (row1luma & 0xff00) << 8;
99                          ldstr2[off] = chromas | row2luma & 0xff | (row2luma & 0xff00) << 8;
100                  }
101                  inY += rB * 2;
102                  inU += rbU;
103                  inV += rbV;
104                  baseAddr += outRB * 2;
105          }
106  }
107
108 static void Y420toY422_ppc_altivec(UInt8 * o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
109 {
110         UInt8                   *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2];
111         unsigned                rY = picture->linesize[0], rU = picture->linesize[1], rV = picture->linesize[2];
112         unsigned                y,x,x2,x4, vWidth = width / 32, halfheight = height / 2;
113        
114         for (y = 0; y < halfheight; y ++) {
115                 vUInt8 *ov = (vUInt8 *)o, *ov2 = (vUInt8 *)(o + outRB), *yv2 = (vUInt8 *)(yc + rY);
116                 vUInt8 *uv  = (vUInt8 *)uc, *vv = (vUInt8 *)vc, *yv = (vUInt8 *)yc;
117                
118                 for (x = 0; x < vWidth; x++) {
119                         x2 = x*2; x4 = x*4;
120                         // ldl/stl = mark data as least recently used in cache so they will be flushed out
121                         __builtin_prefetch(&yv[x+1], 0, 0); __builtin_prefetch(&yv2[x+1], 0, 0);
122                         __builtin_prefetch(&uv[x+1], 0, 0); __builtin_prefetch(&vv[x+1], 0, 0);
123                         vUInt8 tmp_u = vec_ldl(0, &uv[x]), tmp_v = vec_ldl(0, &vv[x]), chroma = vec_mergeh(tmp_u, tmp_v),
124                                         tmp_y = vec_ldl(0, &yv[x2]), tmp_y2 = vec_ldl(0, &yv2[x2]),
125                                         tmp_y3 = vec_ldl(16, &yv[x2]), tmp_y4 = vec_ldl(16, &yv2[x2]), chromal = vec_mergel(tmp_u, tmp_v);
126                        
127                         vec_stl(vec_mergeh(chroma, tmp_y), 0, &ov[x4]);
128                         vec_stl(vec_mergel(chroma, tmp_y), 16, &ov[x4]);
129                         vec_stl(vec_mergeh(chromal, tmp_y3), 32, &ov[x4]);
130                         vec_stl(vec_mergel(chromal, tmp_y3), 48, &ov[x4]);
131                        
132                         vec_stl(vec_mergeh(chroma, tmp_y2), 0, &ov2[x4]);
133                         vec_stl(vec_mergel(chroma, tmp_y2), 16, &ov2[x4]);
134                         vec_stl(vec_mergeh(chromal, tmp_y4), 32, &ov2[x4]);
135                         vec_stl(vec_mergel(chromal, tmp_y4), 48, &ov2[x4]);
136                 }
137                
138                 if (width % 32) { //spill to scalar for the end if the row isn't a multiple of 32
139                         UInt8 *o2 = o + outRB, *yc2 = yc + rY;
140                         for (x = vWidth * 32, x2 = x*2; x < width; x += 2, x2 += 4) {
141                                 unsigned             hx = x / 2;
142                                 o2[x2] = o[x2] = uc[hx];
143                                 o[x2 + 1] = yc[x];
144                                 o2[x2 + 1] = yc2[x];
145                                 o2[x2 + 2] = o[x2 + 2] = vc[hx];
146                                 o[x2 + 3] = yc[x + 1];
147                                 o2[x2 + 3] = yc2[x + 1];
148                         }                       
149                 }
150                
151                 o += outRB; o += outRB;
152                 yc += rY; yc += rY;
153                 uc += rU;
154                 vc += rV;
155         }
156 }
157
158 void Y420toY422(UInt8 * o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
159 {
160         static void (*y420_function)(UInt8* baseAddr, unsigned outRB, unsigned width, unsigned height, AVFrame * picture) = NULL;
161         if (!y420_function) {
162                 int sels[2] = { CTL_HW, HW_VECTORUNIT }; // from http://developer.apple.com/hardwaredrivers/ve/g3_compatibility.html
163                 int vType = 0; //0 == scalar only
164                 size_t length = sizeof(vType);
165                 int error = sysctl(sels, 2, &vType, &length, NULL, 0);
166                 if( 0 == error && vType ) y420_function = Y420toY422_ppc_altivec;
167                 else
168                 y420_function = Y420toY422_ppc_scalar;
169         }
170        
171         y420_function(o, outRB, width, height, picture);
172 }
173 #else
174 #include <emmintrin.h>
175
176 #define TRUNCATE(x, power) (x & ~(power-1))
177
178 static void Y420toY422_sse2(UInt8 * o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
179 {
180         UInt8          *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2];
181         unsigned                rY = picture->linesize[0], rU = picture->linesize[1], rV = picture->linesize[2];
182         unsigned                y,x, vWidth = width / 16, halfheight = height / 2, halfwidth = width / 2;
183        
184         for (y = 0; y < halfheight; y ++) {
185                 __m128i *ov = (__m128i *)o, *ov2 = (__m128i *)(o + outRB), *yv2 = (__m128i *)(yc + rY);
186                 __m128i *yv = (__m128i *)yc;
187                 long long *uv = (long long *)uc, *vv = (long long*)vc;
188                
189                 for (x = 0; x < vWidth; x++) {
190                         /* read one chroma row, two luma rows, write two luma rows at once. this avoids reading chroma twice
191                         * sse2 can do 64-bit loads, so we do that. (apple's h264 doesn't seem to, maybe we should copy them?)
192                         * unrolling loops is very bad on x86 */
193                         unsigned x2 = x*2;
194 //                      __builtin_prefetch(&yv[x+1], 0, 0); __builtin_prefetch(&yv2[x+1], 0, 0); // prefetch next y vectors, throw it out of cache immediately after use
195 //                      __builtin_prefetch(&uv[x+1], 0, 0); __builtin_prefetch(&vv[x+1], 0, 0); // and chroma too
196                         __m128i tmp_y = yv[x],
197                                 tmp_y2 = yv2[x],
198                                 chroma = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)&uv[x]), _mm_loadl_epi64((__m128i*)&vv[x]));
199                         __m128i p1 = _mm_unpacklo_epi8(chroma, tmp_y),
200                                 p3 = _mm_unpacklo_epi8(chroma, tmp_y2),
201                                 p2 = _mm_unpackhi_epi8(chroma, tmp_y),
202                                 p4 = _mm_unpackhi_epi8(chroma, tmp_y2);
203                        
204                         _mm_stream_si128(&ov[x2],p1); // store to memory rather than cache
205                         _mm_stream_si128(&ov[x2+1],p2);
206                         _mm_stream_si128(&ov2[x2],p3);
207                         _mm_stream_si128(&ov2[x2+1],p4);
208                 }
209                
210                 if (__builtin_expect(width % 16 != 0, FALSE)) { //spill to scalar for the end if the row isn't a multiple of 16
211                         UInt8 *o2 = (UInt8*)ov2, *yc2 = (UInt8*)yv2;
212                         for (x = TRUNCATE(width, 16) / 2; x < halfwidth; x++) {
213                                 unsigned x4 = x*4, x2 = x*2;
214                                 o2[x4] = o[x4] = uc[x];
215                                 o[x4 + 1] = yc[x2];
216                                 o2[x4 + 1] = yc2[x2];
217                                 o2[x4 + 2] = o[x4 + 2] = vc[x];
218                                 o[x4 + 3] = yc[x2 + 1];
219                                 o2[x4 + 3] = yc2[x2 + 1];
220                         }                       
221                 }
222                
223                 o += outRB*2;
224                 yc += rY*2;
225                 uc += rU;
226                 vc += rV;
227         }
228         _mm_sfence(); // complete all writes (probably not really needed)
229 }
230
231
232 static void Y420toY422_x86_scalar(UInt8 * o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
233 {
234         UInt8           *yc = picture->data[0], *u = picture->data[1], *v = picture->data[2];
235         unsigned        rY = picture->linesize[0], rU = picture->linesize[1], rV = picture->linesize[2], halfheight = height / 2, halfwidth = width / 2;
236         unsigned        y, x;
237        
238         for (y = 0; y < halfheight; y ++) {
239                 UInt8 *o2 = o + outRB, *yc2 = yc + rY;
240                
241                 for (x = 0; x < halfwidth; x++) {
242                         unsigned x4 = x*4, x2 = x*2;
243                         o2[x4] = o[x4] = u[x];
244                         o[x4 + 1] = yc[x2];
245                         o2[x4 + 1] = yc2[x2];
246                         o2[x4 + 2] = o[x4 + 2] = v[x];
247                         o[x4 + 3] = yc[x2 + 1];
248                         o2[x4 + 3] = yc2[x2 + 1];
249                 }
250                
251                 o += outRB*2;
252                 yc += rY*2;
253                 u += rU;
254                 v += rV;
255         }
256 }
257
258 void Y420toY422(UInt8 * o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
259 {
260         uintptr_t yc = (uintptr_t)picture->data[0];
261
262         //make sure the ffmpeg picture buffers are aligned enough, they're only guaranteed to be 8-byte for some reason...
263         //if input y isn't 16 byte aligned, sse2 crashes
264
265         if (__builtin_expect((yc % 16) == 0 && (picture->linesize[0] % 16) == 0, TRUE)) Y420toY422_sse2(o, outRB, width, height, picture);
266         else Y420toY422_x86_scalar(o, outRB, width, height, picture);
267 }
268 #endif
269
270 void BGR24toRGB24(UInt8 *baseAddr, unsigned rowBytes, unsigned width, unsigned height, AVFrame *picture)
271 {
272         unsigned i, j;
273         UInt8 *srcPtr = picture->data[0];
274         unsigned width_third = width / 3;
275        
276         for (i = 0; i < height; ++i)
277         {
278                 for (j = 0; j < width_third; j ++)
279                 {
280                         unsigned j3 = j * 3;
281                         baseAddr[j3] = srcPtr[j3+2];
282                         baseAddr[j3+1] = srcPtr[j3+1];
283                         baseAddr[j3+2] = srcPtr[j3];
284                 }
285                 baseAddr += rowBytes;
286                 srcPtr += picture->linesize[0];
287         }
288 }
289
290 void RGB32toRGB32(UInt8 *baseAddr, unsigned rowBytes, unsigned width, unsigned height, AVFrame *picture)
291 {
292         unsigned y;
293         UInt8 *srcPtr = picture->data[0];
294        
295         for (y = 0; y < height; y++) {
296 #ifdef __BIG_ENDIAN__
297                 memcpy(baseAddr, srcPtr, width * 4);
298 #else
299                 unsigned x;
300                 UInt32 *oRow = (UInt32 *)baseAddr, *iRow = (UInt32 *)srcPtr;
301                 for (x = 0; x < width; x++) {oRow[x] = EndianU32_BtoN(iRow[x]);}
302 #endif
303                
304                 baseAddr += rowBytes;
305                 srcPtr += picture->linesize[0];
306         }
307 }
308
309 void Y422toY422(UInt8* o, unsigned outRB, unsigned width, unsigned height, AVFrame * picture)
310 {
311         UInt8          *yc = picture->data[0], *u = picture->data[1], *v = picture->data[2];
312         unsigned       rY = picture->linesize[0], rU = picture->linesize[1], rV = picture->linesize[2], y, x, halfwidth = width / 2;
313        
314         for (y = 0; y < height; y++) {
315                 for (x = 0; x < halfwidth; x++) {
316                         unsigned x2 = x * 2, x4 = x * 4;
317                         o[x4] = u[x];
318                         o[x4 + 1] = yc[x2];
319                         o[x4 + 2] = v[x];
320                         o[x4 + 3] = yc[x2 + 1];
321                 }
322                
323                 o += outRB;
324                 yc += rY;
325                 u += rU;
326                 v += rV;
327         }
328 }
Note: See TracBrowser for help on using the browser.