Ticket #113: ffmpeg-64bit-copies-2.diff

File ffmpeg-64bit-copies-2.diff, 2.0 kB (added by astrange, 2 years ago)

MMXify fill_rectangle. For some reason, gcc fails to generate MMX code about half the time when the intrinsics are used, but the result is still faster anyway.

  • libavcodec/h264.c

    old new  
    3838//#undef NDEBUG 
    3939#include <assert.h> 
    4040 
     41#if defined(__MMX__) 
     42#include <mmintrin.h> 
     43#endif 
     44 
    4145#define interlaced_dct interlaced_dct_is_a_bad_name 
    4246#define mb_intra mb_intra_isnt_initalized_see_mb_type 
    4347 
     
    452456        *(uint32_t*)(p + 2*stride)= 
    453457        *(uint32_t*)(p + 3*stride)= v; 
    454458    }else if(w==8){ 
    455     //gcc can't optimize 64bit math on x86_32 
    456 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64) 
     459#if defined(__MMX__) 
     460        __m64 v= _mm_set1_pi32(val);  
     461        *(__m64*)(p + 0*stride)= v; 
     462        if(h==1) return; 
     463        *(__m64*)(p + 1*stride)= v; 
     464        if(h==2) return; 
     465        *(__m64*)(p + 2*stride)= 
     466        *(__m64*)(p + 3*stride)= v; 
     467    }else if(w==16){ 
     468        __m64 v= _mm_set1_pi32(val);  
     469        *(__m64*)(p + 0+0*stride)= 
     470        *(__m64*)(p + 8+0*stride)= 
     471        *(__m64*)(p + 0+1*stride)= 
     472        *(__m64*)(p + 8+1*stride)= v; 
     473        if(h==2) return; 
     474        *(__m64*)(p + 0+2*stride)= 
     475        *(__m64*)(p + 8+2*stride)= 
     476        *(__m64*)(p + 0+3*stride)= 
     477        *(__m64*)(p + 8+3*stride)= v; 
     478        //gcc can't optimize 64bit math on x86_32 
     479#elif defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)  
    457480        const uint64_t v= val*0x0100000001ULL; 
    458481        *(uint64_t*)(p + 0*stride)= v; 
    459482        if(h==1) return; 
     
    507530    assert(h==4); 
    508531} 
    509532 
     533#define uint64_t double 
     534 
    510535static void fill_caches(H264Context *h, int mb_type, int for_deblock){ 
    511536    MpegEncContext * const s = &h->s; 
    512537    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; 
     
    70057030    } 
    70067031} 
    70077032 
     7033#undef uint64_t 
     7034 
    70087035static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) { 
    70097036    MpegEncContext * const s = &h->s; 
    70107037    int mb_xy, mb_type;