Ticket #113: altivec_lum.4.diff

File altivec_lum.4.diff, 11.9 KB (added by gbooker, 7 years ago)

Some cleanup of function names and the like. Removed unused functions. Should be no functionality change.

  • libavcodec/ppc/h264_altivec.c

     
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}

/* Read an unaligned vector. Thanks to Apple for the example code. */
static inline vector unsigned char read_unaligned(int offset, uint8_t *src)
{
    register vector unsigned char first = vec_ld(offset, src);
    register vector unsigned char second = vec_ld(offset+15, src);
    register vector unsigned char mask = vec_lvsl(offset, src);
    return vec_perm(first, second, mask);
}

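For reference, this is the classic AltiVec misaligned-load idiom: two aligned loads bracket the wanted bytes, and vec_lvsl builds the permute mask that shifts them into place. A scalar model of what the function computes, for illustration only (read_unaligned_model is a hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* Scalar model of the two-load + permute idiom in read_unaligned().
     * "first" and "second" stand for the two aligned quadwords that
     * bracket the wanted bytes; "shift" is what vec_lvsl() returns:
     * the low 4 bits of the effective address. */
    static void read_unaligned_model(int offset, const uint8_t *src, uint8_t out[16])
    {
        const uint8_t *base = (const uint8_t *)((uintptr_t)(src + offset) & ~(uintptr_t)15);
        unsigned shift = (uintptr_t)(src + offset) & 15;
        const uint8_t *first  = base;       /* vec_ld(offset, src)    */
        const uint8_t *second = base + 16;  /* vec_ld(offset+15, src) */
        for (int i = 0; i < 16; i++)
            out[i] = (shift + i < 16) ? first[shift + i] : second[shift + i - 16];
    }
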
#define transpose4x16(r0, r1, r2, r3) {\
    register vector unsigned char r4;\
    register vector unsigned char r5;\
    register vector unsigned char r6;\
    register vector unsigned char r7;\
    \
    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/\
    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/\
    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/\
    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/\
    \
    r0 = vec_mergeh(r4, r6);  /*all set 0*/\
    r1 = vec_mergel(r4, r6);  /*all set 1*/\
    r2 = vec_mergeh(r5, r7);  /*all set 2*/\
    r3 = vec_mergel(r5, r7);  /*all set 3*/\
}

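The merge-based transpose works because vec_mergeh/vec_mergel interleave two vectors byte by byte; after two rounds of interleaving, each group of four consecutive output bytes is one column of the original 4x16 block, ready for write16x4 below. A scalar demonstration of the idea (mergeh/mergel here are hypothetical scalar stand-ins for the intrinsics):

    #include <stdio.h>
    #include <stdint.h>

    /* Scalar mergeh/mergel on 16-byte "vectors": interleave the high or
     * low halves of a and b, like vec_mergeh/vec_mergel on unsigned char. */
    static void mergeh(const uint8_t a[16], const uint8_t b[16], uint8_t out[16]) {
        for (int i = 0; i < 8; i++) { out[2*i] = a[i];   out[2*i+1] = b[i];   }
    }
    static void mergel(const uint8_t a[16], const uint8_t b[16], uint8_t out[16]) {
        for (int i = 0; i < 8; i++) { out[2*i] = a[8+i]; out[2*i+1] = b[8+i]; }
    }

    int main(void) {
        /* Four rows r0..r3 of a 4x16 block; element j of row i is 10*i+j. */
        uint8_t r[4][16], m[4][16], t[4][16];
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 16; j++)
                r[i][j] = 10*i + j;
        /* Round 1: interleave rows 0/2 and 1/3, as transpose4x16 does. */
        mergeh(r[0], r[2], m[0]); mergel(r[0], r[2], m[1]);
        mergeh(r[1], r[3], m[2]); mergel(r[1], r[3], m[3]);
        /* Round 2: interleave the intermediates. */
        mergeh(m[0], m[2], t[0]); mergel(m[0], m[2], t[1]);
        mergeh(m[1], m[3], t[2]); mergel(m[1], m[3], t[3]);
        /* Bytes 4k..4k+3 of the output are column k of the input,
         * i.e. r0[k], r1[k], r2[k], r3[k] back to back. */
        printf("%d %d %d %d\n", t[0][0], t[0][1], t[0][2], t[0][3]); /* 0 10 20 30 */
        return 0;
    }
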
static inline void write16x4(uint8_t *dst, int dst_stride,
                             register vector unsigned char r0, register vector unsigned char r1,
                             register vector unsigned char r2, register vector unsigned char r3) {
    unsigned char result[64] __attribute__((aligned(16)));
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    vec_st(r0, 0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int +    int_dst_stride) = *(src_int + 1);
    *(dst_int +  2*int_dst_stride) = *(src_int + 2);
    *(dst_int +  3*int_dst_stride) = *(src_int + 3);
    *(dst_int +  4*int_dst_stride) = *(src_int + 4);
    *(dst_int +  5*int_dst_stride) = *(src_int + 5);
    *(dst_int +  6*int_dst_stride) = *(src_int + 6);
    *(dst_int +  7*int_dst_stride) = *(src_int + 7);
    *(dst_int +  8*int_dst_stride) = *(src_int + 8);
    *(dst_int +  9*int_dst_stride) = *(src_int + 9);
    *(dst_int + 10*int_dst_stride) = *(src_int + 10);
    *(dst_int + 11*int_dst_stride) = *(src_int + 11);
    *(dst_int + 12*int_dst_stride) = *(src_int + 12);
    *(dst_int + 13*int_dst_stride) = *(src_int + 13);
    *(dst_int + 14*int_dst_stride) = *(src_int + 14);
    *(dst_int + 15*int_dst_stride) = *(src_int + 15);
}

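The detour through the aligned scratch buffer is needed because vec_st can only store whole, aligned quadwords, while each output row here is just 4 bytes at an arbitrary address; the word-at-a-time copy then scatters those rows. A hedged sketch of an equivalent scatter (scatter16x4 is hypothetical, not part of the patch) that avoids the alignment assumption the uint32_t stores make:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative scalar equivalent of write16x4's scatter loop: copy
     * row i (4 bytes) of the transposed block from the aligned scratch
     * buffer to dst + i*dst_stride.  memcpy sidesteps the unaligned
     * uint32_t accesses, which PowerPC tolerates in scalar code but
     * which are not portable C. */
    static void scatter16x4(uint8_t *dst, int dst_stride,
                            const unsigned char result[64])
    {
        for (int i = 0; i < 16; i++)
            memcpy(dst + i * dst_stride, result + 4 * i, 4);
    }
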
/* Read sixteen rows from src, transpose them, and return the first six
 * rows of the transpose (i.e. the first six columns of src) in r8-r13 */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    register vector unsigned char r0 = read_unaligned(0, src);\
    register vector unsigned char r1 = read_unaligned(src_stride, src);\
    register vector unsigned char r2 = read_unaligned(2*src_stride, src);\
    register vector unsigned char r3 = read_unaligned(3*src_stride, src);\
    register vector unsigned char r4 = read_unaligned(4*src_stride, src);\
    register vector unsigned char r5 = read_unaligned(5*src_stride, src);\
    register vector unsigned char r6 = read_unaligned(6*src_stride, src);\
    register vector unsigned char r7 = read_unaligned(7*src_stride, src);\
    register vector unsigned char r14 = read_unaligned(14*src_stride, src);\
    register vector unsigned char r15 = read_unaligned(15*src_stride, src);\
    \
    r8 = read_unaligned(8*src_stride, src);\
    r9 = read_unaligned(9*src_stride, src);\
    r10 = read_unaligned(10*src_stride, src);\
    r11 = read_unaligned(11*src_stride, src);\
    r12 = read_unaligned(12*src_stride, src);\
    r13 = read_unaligned(13*src_stride, src);\
    \
    /*Merge first pairs*/\
    r0 = vec_mergeh(r0, r8);    /*0,8*/\
    r1 = vec_mergeh(r1, r9);    /*1,9*/\
    r2 = vec_mergeh(r2, r10);   /*2,10*/\
    r3 = vec_mergeh(r3, r11);   /*3,11*/\
    r4 = vec_mergeh(r4, r12);   /*4,12*/\
    r5 = vec_mergeh(r5, r13);   /*5,13*/\
    r6 = vec_mergeh(r6, r14);   /*6,14*/\
    r7 = vec_mergeh(r7, r15);   /*7,15*/\
    \
    /*Merge second pairs*/\
    r8 = vec_mergeh(r0, r4);    /*0,4,8,12 set 0*/\
    r9 = vec_mergel(r0, r4);    /*0,4,8,12 set 1*/\
    r10 = vec_mergeh(r1, r5);   /*1,5,9,13 set 0*/\
    r11 = vec_mergel(r1, r5);   /*1,5,9,13 set 1*/\
    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/\
    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/\
    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/\
    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/\
    \
    /*Third merge*/\
    r0 = vec_mergeh(r8, r12);   /*0,2,4,6,8,10,12,14 set 0*/\
    r1 = vec_mergel(r8, r12);   /*0,2,4,6,8,10,12,14 set 1*/\
    r2 = vec_mergeh(r9, r13);   /*0,2,4,6,8,10,12,14 set 2*/\
    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/\
    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/\
    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/\
    /* Don't need to compute 3 and 7 */\
    \
    /*Final merge*/\
    r8 = vec_mergeh(r0, r4);    /*all set 0*/\
    r9 = vec_mergel(r0, r4);    /*all set 1*/\
    r10 = vec_mergeh(r1, r5);   /*all set 2*/\
    r11 = vec_mergel(r1, r5);   /*all set 3*/\
    r12 = vec_mergeh(r2, r6);   /*all set 4*/\
    r13 = vec_mergel(r2, r6);   /*all set 5*/\
    /* Don't need to compute 14 and 15 */\
}

// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec(
    register vector unsigned char x,
    register vector unsigned char y,
    register vector unsigned char a) {

    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}

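diff_lt_altivec leans on unsigned saturation: vec_subs(x, y) is x-y where x > y and 0 elsewhere, so OR-ing the two one-sided differences yields |x-y| without widening to a larger element type. A scalar sanity check of the identity (illustrative only; subs_u8 is a hypothetical stand-in for vec_subs on one byte):

    #include <assert.h>
    #include <stdlib.h>

    /* Unsigned byte subtract saturating at 0, like vec_subs on
     * vector unsigned char. */
    static unsigned char subs_u8(unsigned char a, unsigned char b) {
        return a > b ? (unsigned char)(a - b) : 0;
    }

    int main(void) {
        /* For every pair of bytes, subs(x,y) | subs(y,x) == |x - y|:
         * one side is always 0, the other is the absolute difference. */
        for (int x = 0; x < 256; x++)
            for (int y = 0; y < 256; y++)
                assert((subs_u8(x, y) | subs_u8(y, x)) == abs(x - y));
        return 0;
    }
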
static inline vector unsigned char h264_deblock_mask(
    register vector unsigned char p0,
    register vector unsigned char p1,
    register vector unsigned char q0,
    register vector unsigned char q1,
    register vector unsigned char alpha,
    register vector unsigned char beta) {

    register vector unsigned char mask;
    register vector unsigned char tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}

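Per element, this mask is the standard H.264 filter-edge condition: filter only where the step across the edge is small enough to be a blocking artifact rather than real detail. A scalar rendering for one pixel position (illustrative; deblock_mask_scalar is not part of the patch):

    #include <stdlib.h>

    /* Scalar equivalent of h264_deblock_mask at one position. */
    static int deblock_mask_scalar(int p0, int p1, int q0, int q1,
                                   int alpha, int beta)
    {
        return abs(p0 - q0) < alpha &&
               abs(p1 - p0) < beta  &&
               abs(q1 - q0) < beta;
    }
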
// out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
#define h264_deblock_q1(p0, p1, p2, q0, tc0) {\
    \
    register vector unsigned char average = vec_avg(p0, q0);\
    register vector unsigned char temp;\
    register vector unsigned char unclipped;\
    register vector unsigned char ones;\
    register vector unsigned char max;\
    register vector unsigned char min;\
    \
    temp = vec_xor(average, p2);\
    average = vec_avg(average, p2);      /* avg(p2, avg(p0, q0)) */\
    ones = vec_splat_u8(1);\
    temp = vec_and(temp, ones);          /* (p2^avg(p0, q0)) & 1 */\
    unclipped = vec_subs(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */\
    max = vec_adds(p1, tc0);\
    min = vec_subs(p1, tc0);\
    p1 = vec_max(min, unclipped);\
    p1 = vec_min(max, p1);\
}

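The subtraction of (p2 ^ avg(p0,q0)) & 1 corrects for vec_avg's rounding: vec_avg rounds up, (a+b+1)>>1, but the spec value (p2 + ((p0+q0+1)>>1)) >> 1 truncates its outer shift. The two differ by exactly (a^b)&1, which is what gets subtracted. An exhaustive scalar check of that identity (illustrative only):

    #include <assert.h>

    /* Rounding-up byte average, like vec_avg. */
    static unsigned avg_u8(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

    int main(void) {
        /* (a+b)>>1 == ((a+b+1)>>1) - ((a^b)&1): the truncating average
         * is recovered from the rounding one with an AND and a subtract. */
        for (unsigned a = 0; a < 256; a++)
            for (unsigned b = 0; b < 256; b++)
                assert(((a + b) >> 1) == avg_u8(a, b) - ((a ^ b) & 1));
        return 0;
    }
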
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {\
    \
    const vec_u8_t A1v = (vec_u8_t) AVV(0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,\
                                        0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1);\
    \
    register vector unsigned char p0q0xorbit = vec_xor(p0, q0);\
    register vector unsigned char temp;\
    register vector unsigned char q1minus;\
    register vector unsigned char p0minus;\
    register vector unsigned char stage1;\
    register vector unsigned char stage2;\
    register vector unsigned char vec161;\
    register vector unsigned char delta;\
    register vector unsigned char deltaneg;\
    \
    p0q0xorbit = vec_and(p0q0xorbit, vec_splat_u8(1));   /* (p0^q0)&1 */\
    temp = (vector unsigned char)vec_cmpeq(p0, p0);      /* all-ones */\
    q1minus = vec_xor(temp, q1);               /* 255 - q1 */\
    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */\
    stage2 = vec_avg(vec_splat_u8(3), stage1); /* (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2) */\
    p0minus = vec_xor(temp, p0);               /* 255 - p0 */\
    stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */\
    stage2 = vec_avg(stage2, p0q0xorbit);      /* 33+((p1-q1)>>3) */\
    stage2 = vec_adds(stage2, stage1);         /* d+128 + 33 */\
    vec161 = vec_ld(0, &A1v);\
    deltaneg = vec_subs(vec161, stage2);       /* -d */\
    delta = vec_subs(stage2, vec161);          /* d */\
    deltaneg = vec_min(tc0masked, deltaneg);\
    delta = vec_min(tc0masked, delta);\
    p0 = vec_subs(p0, deltaneg);\
    q0 = vec_subs(q0, delta);\
    p0 = vec_adds(p0, delta);\
    q0 = vec_adds(q0, deltaneg);\
}

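h264_deblock_p0_q0 evaluates the signed delta of the spec entirely in unsigned bytes by keeping it biased: vec_avg(x, 255-y) equals (x - y + 256)>>1, a halved difference offset by +128. After the two averaging stages and the vec_adds, the running value is (ignoring rounding details) delta + 161, hence the 0xA1 constant; saturating subtraction of 0xA1 in each direction splits it into the positive and negative parts of delta, each clipped to tc0 by vec_min. An exhaustive scalar check of the complement trick (illustrative only):

    #include <assert.h>

    static unsigned avg_u8(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

    int main(void) {
        /* Averaging x with 255-y computes the biased half-difference
         * (x - y + 256) >> 1, which always fits in a byte.  The macro
         * keeps its whole computation in this biased domain and removes
         * the accumulated bias (0xA1 == 161) at the end. */
        for (unsigned x = 0; x < 256; x++)
            for (unsigned y = 0; y < 256; y++)
                assert(avg_u8(x, 255 - y) == (x - y + 256) >> 1);
        return 0;
    }
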
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {\
    unsigned char temp[16] __attribute__((aligned(16)));\
    register vector unsigned char alphavec;\
    register vector unsigned char betavec;\
    register vector unsigned char mask;\
    register vector unsigned char p1mask;\
    register vector unsigned char q1mask;\
    register vector signed char tc0vec;\
    register vector unsigned char finaltc0;\
    register vector unsigned char tc0masked;\
    \
    temp[0] = alpha;\
    temp[1] = beta;\
    alphavec = vec_ld(0, temp);\
    betavec = vec_splat(alphavec, 0x1);\
    alphavec = vec_splat(alphavec, 0x0);\
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */\
    \
    *((int *)temp) = *((int *)tc0);\
    tc0vec = vec_ld(0, (signed char *)temp);\
    tc0vec = vec_mergeh(tc0vec, tc0vec);\
    tc0vec = vec_mergeh(tc0vec, tc0vec);\
    mask = vec_and(mask, (vector unsigned char)vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */\
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0[i] */\
    \
    p1mask = diff_lt_altivec(p2, p0, betavec);\
    p1mask = vec_and(p1mask, mask);        /* if ( |p2 - p0| < beta ) */\
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);\
    finaltc0 = vec_sub(finaltc0, p1mask);  /* tc++ */\
    h264_deblock_q1(p0, p1, p2, q0, tc0masked);\
    /* end if */\
    \
    q1mask = diff_lt_altivec(q2, q0, betavec);\
    q1mask = vec_and(q1mask, mask);        /* if ( |q2 - q0| < beta ) */\
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);\
    finaltc0 = vec_sub(finaltc0, q1mask);  /* tc++ */\
    h264_deblock_q1(p0, q1, q2, q0, tc0masked);\
    /* end if */\
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);\
}

static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0)
    {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}

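The entry check relies on sign bits: ANDing the four int8_t values leaves the sign bit set only if every tc0[i] is negative, so the expression is >= 0 exactly when at least one edge segment has tc0[i] >= 0 (tc0[i] == -1 is the "skip this segment" marker). A small scalar check of the idiom (illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* (a & b & c & d) < 0 iff the sign bit survives the AND,
         * i.e. iff all four values are negative. */
        int8_t all_skip[4] = { -1, -1, -1, -1 };
        int8_t one_live[4] = { -1, -1,  3, -1 };
        assert((all_skip[0] & all_skip[1] & all_skip[2] & all_skip[3]) < 0);
        assert((one_live[0] & one_live[1] & one_live[2] & one_live[3]) >= 0);
        return 0;
    }
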
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    register vector unsigned char line0, line1, line2, line3, line4, line5;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}

void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    /* … */
    c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
    c->h264_idct8_add = ff_h264_idct8_add_altivec;
    c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
    c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \