| | 527 | static inline vector unsigned char read_unaligned(int offset, uint8_t *src) |
|---|
| | 528 | { |
|---|
| | 529 | register vector unsigned char first = vec_ld(offset, src); |
|---|
| | 530 | register vector unsigned char second = vec_ld(offset+15, src); |
|---|
| | 531 | register vector unsigned char mask = vec_lvsl(offset, src); |
|---|
| | 532 | return vec_perm(first, second, mask); |
|---|
| | 533 | } |
|---|
| | 534 | |
|---|
| | 535 | static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
|---|
| | 536 | register vector unsigned char a = read_unaligned(0, src); |
|---|
| | 537 | register vector unsigned char b = read_unaligned(src_stride, src); |
|---|
| | 538 | register vector unsigned char c = read_unaligned(2*src_stride, src); |
|---|
| | 539 | register vector unsigned char d = read_unaligned(3*src_stride, src); |
|---|
| | 540 | unsigned char result[16] __attribute__((aligned(16))); |
|---|
| | 541 | |
|---|
| | 542 | a = vec_mergeh(a, c); //ac |
|---|
| | 543 | c = vec_mergeh(b, d); //bd |
|---|
| | 544 | a = vec_mergeh(a, c); //abcd |
|---|
| | 545 | vec_st(a, 0, result); |
|---|
| | 546 | /* there has to be a better way!!!! */ |
|---|
| | 547 | memcpy(dst, result, 4); |
|---|
| | 548 | memcpy(dst+dst_stride, result+4, 4); |
|---|
| | 549 | memcpy(dst+2*dst_stride, result+8, 4); |
|---|
| | 550 | memcpy(dst+3*dst_stride, result+12, 4); |
|---|
| | 551 | } |
|---|
| | 552 | |
|---|
| | 553 | /* This function does an 8x16 transpose on data where the pointer ends in 0xc*/ |
|---|
| | 554 | static inline void transpose8x16(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
|---|
| | 555 | register vector unsigned char r0 = read_unaligned(0, src); |
|---|
| | 556 | register vector unsigned char r1 = read_unaligned(src_stride, src); |
|---|
| | 557 | register vector unsigned char r2 = read_unaligned(2*src_stride, src); |
|---|
| | 558 | register vector unsigned char r3 = read_unaligned(3*src_stride, src); |
|---|
| | 559 | register vector unsigned char r4 = read_unaligned(4*src_stride, src); |
|---|
| | 560 | register vector unsigned char r5 = read_unaligned(5*src_stride, src); |
|---|
| | 561 | register vector unsigned char r6 = read_unaligned(6*src_stride, src); |
|---|
| | 562 | register vector unsigned char r7 = read_unaligned(7*src_stride, src); |
|---|
| | 563 | register vector unsigned char r8 = read_unaligned(8*src_stride, src); |
|---|
| | 564 | register vector unsigned char r9 = read_unaligned(9*src_stride, src); |
|---|
| | 565 | register vector unsigned char r10 = read_unaligned(10*src_stride, src); |
|---|
| | 566 | register vector unsigned char r11 = read_unaligned(11*src_stride, src); |
|---|
| | 567 | register vector unsigned char r12 = read_unaligned(12*src_stride, src); |
|---|
| | 568 | register vector unsigned char r13 = read_unaligned(13*src_stride, src); |
|---|
| | 569 | register vector unsigned char r14 = read_unaligned(14*src_stride, src); |
|---|
| | 570 | register vector unsigned char r15 = read_unaligned(15*src_stride, src); |
|---|
| | 571 | |
|---|
| | 572 | //Merge first pairs |
|---|
| | 573 | r0 = vec_mergeh(r0, r8); //0,8 |
|---|
| | 574 | r1 = vec_mergeh(r1, r9); //1,9 |
|---|
| | 575 | r2 = vec_mergeh(r2, r10); //2,10 |
|---|
| | 576 | r3 = vec_mergeh(r3, r11); //3,11 |
|---|
| | 577 | r4 = vec_mergeh(r4, r12); //4,12 |
|---|
| | 578 | r5 = vec_mergeh(r5, r13); //5,13 |
|---|
| | 579 | r6 = vec_mergeh(r6, r14); //6,14 |
|---|
| | 580 | r7 = vec_mergeh(r7, r15); //7,15 |
|---|
| | 581 | |
|---|
| | 582 | //Merge second pairs |
|---|
| | 583 | r8 = vec_mergeh(r0, r4); //0,4,8,12 set 0 |
|---|
| | 584 | r9 = vec_mergel(r0, r4); //0,4,8,12 set 1 |
|---|
| | 585 | r10 = vec_mergeh(r1, r5); //1,5,9,13 set 0 |
|---|
| | 586 | r11 = vec_mergel(r1, r5); //1,5,9,13 set 1 |
|---|
| | 587 | r12 = vec_mergeh(r2, r6); //2,6,10,14 set 0 |
|---|
| | 588 | r13 = vec_mergel(r2, r6); //2,6,10,14 set 1 |
|---|
| | 589 | r14 = vec_mergeh(r3, r7); //3,7,11,15 set 0 |
|---|
| | 590 | r15 = vec_mergel(r3, r7); //3,7,11,15 set 1 |
|---|
| | 591 | |
|---|
| | 592 | //Third merge |
|---|
| | 593 | r0 = vec_mergeh(r8, r12); //0,2,4,6,8,10,12,14 set 0 |
|---|
| | 594 | r1 = vec_mergel(r8, r12); //0,2,4,6,8,10,12,14 set 1 |
|---|
| | 595 | r2 = vec_mergeh(r9, r13); //0,2,4,6,8,10,12,14 set 2 |
|---|
| | 596 | r3 = vec_mergel(r9, r13); //0,2,4,6,8,10,12,14 set 3 |
|---|
| | 597 | r4 = vec_mergeh(r10, r14); //1,3,5,7,9,11,13,15 set 0 |
|---|
| | 598 | r5 = vec_mergel(r10, r14); //1,3,5,7,9,11,13,15 set 1 |
|---|
| | 599 | r6 = vec_mergeh(r11, r15); //1,3,5,7,9,11,13,15 set 2 |
|---|
| | 600 | r7 = vec_mergel(r11, r15); //1,3,5,7,9,11,13,15 set 3 |
|---|
| | 601 | |
|---|
| | 602 | //Final merge |
|---|
| | 603 | r8 = vec_mergeh(r0, r4); //all set 0 |
|---|
| | 604 | r9 = vec_mergel(r0, r4); //all set 1 |
|---|
| | 605 | r10 = vec_mergeh(r1, r5); //all set 2 |
|---|
| | 606 | r11 = vec_mergel(r1, r5); //all set 3 |
|---|
| | 607 | r12 = vec_mergeh(r2, r6); //all set 4 |
|---|
| | 608 | r13 = vec_mergel(r2, r6); //all set 5 |
|---|
| | 609 | r14 = vec_mergeh(r3, r7); //all set 6 |
|---|
| | 610 | r15 = vec_mergel(r3, r7); //all set 7 |
|---|
| | 611 | |
|---|
| | 612 | vec_st(r8, 0, dst); |
|---|
| | 613 | vec_st(r9, dst_stride, dst); |
|---|
| | 614 | vec_st(r10, 2*dst_stride, dst); |
|---|
| | 615 | vec_st(r11, 3*dst_stride, dst); |
|---|
| | 616 | vec_st(r12, 4*dst_stride, dst); |
|---|
| | 617 | vec_st(r13, 5*dst_stride, dst); |
|---|
| | 618 | vec_st(r14, 6*dst_stride, dst); |
|---|
| | 619 | vec_st(r15, 7*dst_stride, dst); |
|---|
| | 620 | } |
|---|
| | 621 | |
|---|
| | 622 | /*static inline void transpose8x8(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
|---|
| | 623 | unsigned char result[64] __attribute__((aligned(16))); |
|---|
| | 624 | |
|---|
| | 625 | transpose8x8aligned(result, src, 16, src_stride); |
|---|
| | 626 | |
|---|
| | 627 | /* there has to be a better way!!!! * |
|---|
| | 628 | memcpy(dst, result, 8); |
|---|
| | 629 | memcpy(dst+dst_stride, result+8, 8); |
|---|
| | 630 | memcpy(dst+2*dst_stride, result+16, 8); |
|---|
| | 631 | memcpy(dst+3*dst_stride, result+24, 8); |
|---|
| | 632 | memcpy(dst+4*dst_stride, result+32, 8); |
|---|
| | 633 | memcpy(dst+5*dst_stride, result+40, 8); |
|---|
| | 634 | memcpy(dst+6*dst_stride, result+48, 8); |
|---|
| | 635 | memcpy(dst+7*dst_stride, result+56, 8); |
|---|
| | 636 | }*/ |
|---|
| | 637 | |
|---|
| | 638 | // out: o = |x-y| < a |
|---|
| | 639 | static inline vector unsigned char diff_lt_altivec( |
|---|
| | 640 | register vector unsigned char x, |
|---|
| | 641 | register vector unsigned char y, |
|---|
| | 642 | register vector unsigned char a) { |
|---|
| | 643 | |
|---|
| | 644 | register vector unsigned char diff = vec_subs(x, y); |
|---|
| | 645 | register vector unsigned char diffneg = vec_subs(y, x); |
|---|
| | 646 | register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */ |
|---|
| | 647 | o = vec_cmplt(o, a); |
|---|
| | 648 | return o; |
|---|
| | 649 | } |
|---|
| | 650 | |
|---|
| | 651 | static inline vector unsigned char h264_deblock_mask( |
|---|
| | 652 | register vector unsigned char p0, |
|---|
| | 653 | register vector unsigned char p1, |
|---|
| | 654 | register vector unsigned char q0, |
|---|
| | 655 | register vector unsigned char q1, |
|---|
| | 656 | register vector unsigned char alpha, |
|---|
| | 657 | register vector unsigned char beta) { |
|---|
| | 658 | |
|---|
| | 659 | register vector unsigned char mask; |
|---|
| | 660 | register vector unsigned char tempmask; |
|---|
| | 661 | |
|---|
| | 662 | mask = diff_lt_altivec(p0, q0, alpha); |
|---|
| | 663 | tempmask = diff_lt_altivec(p1, p0, beta); |
|---|
| | 664 | mask = vec_and(mask, tempmask); |
|---|
| | 665 | tempmask = diff_lt_altivec(q1, q0, beta); |
|---|
| | 666 | mask = vec_and(mask, tempmask); |
|---|
| | 667 | |
|---|
| | 668 | return mask; |
|---|
| | 669 | } |
|---|
| | 670 | |
|---|
| | 671 | static inline vector char readVector(int offset, int8_t *input) |
|---|
| | 672 | { |
|---|
| | 673 | unsigned int unalign = (offset + (unsigned int)input) & 0xf; |
|---|
| | 674 | if(unalign == 0) |
|---|
| | 675 | return vec_ld(offset, input); |
|---|
| | 676 | else |
|---|
| | 677 | { |
|---|
| | 678 | register vector char a = vec_ld(offset, input); |
|---|
| | 679 | register vector char b = vec_ld(offset+16, input); |
|---|
| | 680 | return vec_sld(a, b, 8); |
|---|
| | 681 | } |
|---|
| | 682 | } |
|---|
| | 683 | |
|---|
| | 684 | // out: *(output) = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) |
|---|
| | 685 | static inline void h264_deblock_q1( |
|---|
| | 686 | register vector unsigned char p0, |
|---|
| | 687 | register vector unsigned char p1, |
|---|
| | 688 | register vector unsigned char p2, |
|---|
| | 689 | register vector unsigned char q0, |
|---|
| | 690 | register vector unsigned char tc0, |
|---|
| | 691 | int8_t *output){ |
|---|
| | 692 | |
|---|
| | 693 | register vector unsigned char average = vec_avg(p0, q0); |
|---|
| | 694 | register vector unsigned char temp; |
|---|
| | 695 | register vector unsigned char uncliped; |
|---|
| | 696 | register vector unsigned char ones; |
|---|
| | 697 | register vector unsigned char max; |
|---|
| | 698 | register vector unsigned char min; |
|---|
| | 699 | register vector unsigned char final; |
|---|
| | 700 | /* int i; |
|---|
| | 701 | unsigned char p0sv[16] __attribute__((aligned(16))); |
|---|
| | 702 | unsigned char p1sv[16] __attribute__((aligned(16))); |
|---|
| | 703 | unsigned char p2sv[16] __attribute__((aligned(16))); |
|---|
| | 704 | unsigned char q0sv[16] __attribute__((aligned(16))); |
|---|
| | 705 | unsigned char tc0sv[16] __attribute__((aligned(16)));*/ |
|---|
| | 706 | |
|---|
| | 707 | temp = vec_xor(average, p2); |
|---|
| | 708 | average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ |
|---|
| | 709 | ones = vec_splat_u8(1); |
|---|
| | 710 | temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ |
|---|
| | 711 | uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ |
|---|
| | 712 | max = vec_adds(p1, tc0); |
|---|
| | 713 | min = vec_subs(p1, tc0); |
|---|
| | 714 | final = vec_max(min, uncliped); |
|---|
| | 715 | final = vec_min(max, final); |
|---|
| | 716 | vec_st(final, 0, output); |
|---|
| | 717 | |
|---|
| | 718 | /* vec_st(p0, 0, p0sv); |
|---|
| | 719 | vec_st(p1, 0, p1sv); |
|---|
| | 720 | vec_st(p2, 0, p2sv); |
|---|
| | 721 | vec_st(q0, 0, q0sv); |
|---|
| | 722 | vec_st(tc0, 0, tc0sv); |
|---|
| | 723 | for(i=0; i<16; i++) |
|---|
| | 724 | { |
|---|
| | 725 | int p0s = p0sv[i]; |
|---|
| | 726 | int p1s = p1sv[i]; |
|---|
| | 727 | int p2s = p2sv[i]; |
|---|
| | 728 | int q0s = q0sv[i]; |
|---|
| | 729 | int tc0s = tc0sv[i]; |
|---|
| | 730 | char val = p1s + av_clip( (( p2s + ( ( p0s + q0s + 1) >> 1 ) ) >> 1) -p1s, -tc0s, tc0s); |
|---|
| | 731 | if(val != output[i]) |
|---|
| | 732 | output[i] = val; |
|---|
| | 733 | }*/ |
|---|
| | 734 | } |
|---|
| | 735 | |
|---|
/*
 * Filter the two samples adjacent to the edge (p0 and q0), 16 bytes at a time,
 * and store the results at p0loc and q0loc.
 *
 * This looks like the vector form of the scalar step
 *   d   = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 *   p0' = clip_uint8(p0 + d),  q0' = clip_uint8(q0 - d)
 * from h264_loop_filter_luma_c, with tc supplied per byte in tc0masked.
 * All arithmetic is on unsigned bytes, so the signed delta is carried with a
 * bias of 0xA1 (161 = 128 + 33) and split into a positive and a negative part.
 */
static inline void h264_deblock_p0_q0(
    register vector unsigned char p0,
    register vector unsigned char p1,
    register vector unsigned char q0,
    register vector unsigned char q1,
    register vector unsigned char tc0masked,
    int8_t *p0loc,
    int8_t *q0loc){

    /* the bias (161) that remains on the delta after the averaging steps below */
    const vec_u8_t A1v = (vec_u8_t) AVV(0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,
                                       0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1);

    register vector unsigned char p0q0xorbit = vec_xor(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char q1minus;
    register vector unsigned char p0minus;
    register vector unsigned char stage1;
    register vector unsigned char stage2;
    register vector unsigned char vec161;
    register vector unsigned char newp0;
    register vector unsigned char newq0;
    register vector unsigned char delta;
    register vector unsigned char deltaneg;

    p0q0xorbit = vec_and(p0q0xorbit, vec_splat_u8(1)); /* (p0^q0)&1 */
    temp = vec_cmpeq(p0, p0);                           /* all bits set (0xff in every byte) */
    q1minus = vec_xor(temp, q1); /* 255 - q1 */
    stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */
    stage2 = vec_avg(vec_splat_u8(3), stage1); /* (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 */
    p0minus = vec_xor(temp, p0); /* 255 - p0 */
    stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */
    stage2 = vec_avg(stage2, p0q0xorbit); /* 33+(p1-q1)>>3 */
    stage2 = vec_adds(stage2, stage1); /* d+128 + 33 */
    vec161 = vec_ld(0, &A1v);
    /* saturating subtraction in both directions splits the biased value
     * into max(-d, 0) and max(d, 0) */
    deltaneg = vec_subs(vec161, stage2); /* -d */
    delta = vec_subs(stage2, vec161); /* d */
    /* limit both halves to tc */
    deltaneg = vec_min(tc0masked, deltaneg);
    delta = vec_min(tc0masked, delta);
    /* p0' = p0 - max(-d,0) + max(d,0), q0' = q0 - max(d,0) + max(-d,0);
     * the saturating ops keep the results within 0..255 */
    newp0 = vec_subs(p0, deltaneg);
    newq0 = vec_subs(q0, delta);
    newp0 = vec_adds(newp0, delta);
    newq0 = vec_adds(newq0, deltaneg);
    vec_st(newp0, 0, p0loc);
    vec_st(newq0, 0, q0loc);
}
|---|
| | 781 | |
|---|
| | 782 | static inline void h264_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
|---|
| | 783 | { |
|---|
| | 784 | unsigned char temp[16] __attribute__((aligned(16))); |
|---|
| | 785 | // unsigned char temp2[16] __attribute__((aligned(16))); |
|---|
| | 786 | register vector unsigned char p0 = vec_ld(-1*stride, pix); |
|---|
| | 787 | register vector unsigned char p1 = vec_ld(-2*stride, pix); |
|---|
| | 788 | register vector unsigned char p2; |
|---|
| | 789 | register vector unsigned char q0 = vec_ld(0, pix); |
|---|
| | 790 | register vector unsigned char q1 = vec_ld(1*stride, pix); |
|---|
| | 791 | register vector unsigned char q2; |
|---|
| | 792 | register vector unsigned char alphavec; |
|---|
| | 793 | register vector unsigned char betavec; |
|---|
| | 794 | register vector unsigned char mask; |
|---|
| | 795 | register vector unsigned char p1mask; |
|---|
| | 796 | register vector unsigned char q1mask; |
|---|
| | 797 | register vector char tc0vec; |
|---|
| | 798 | register vector unsigned char finaltc0; |
|---|
| | 799 | register vector unsigned char tc0masked; |
|---|
| | 800 | |
|---|
| | 801 | // int i; |
|---|
| | 802 | |
|---|
| | 803 | temp[0] = alpha; |
|---|
| | 804 | temp[1] = beta; |
|---|
| | 805 | alphavec = vec_ld(0, temp); |
|---|
| | 806 | betavec = vec_splat(alphavec, 0x1); |
|---|
| | 807 | alphavec = vec_splat(alphavec, 0x0); |
|---|
| | 808 | mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ |
|---|
| | 809 | /* vec_st(mask, 0, temp); |
|---|
| | 810 | for(i=0; i<16; i++) |
|---|
| | 811 | { |
|---|
| | 812 | int p0s = pix[i-stride]; |
|---|
| | 813 | int p1s = pix[i-2*stride]; |
|---|
| | 814 | int q0s = pix[i]; |
|---|
| | 815 | int q1s = pix[i+stride]; |
|---|
| | 816 | unsigned char val = 00; |
|---|
| | 817 | if(FFABS( p0s - q0s ) < alpha && |
|---|
| | 818 | FFABS( p1s - p0s ) < beta && |
|---|
| | 819 | FFABS( q1s - q0s ) < beta) |
|---|
| | 820 | val = 0xff; |
|---|
| | 821 | if(val != temp[i]) |
|---|
| | 822 | temp[i] = 0x80; |
|---|
| | 823 | }*/ |
|---|
| | 824 | *((int *)temp) = *((int *)tc0); |
|---|
| | 825 | tc0vec = vec_ld(0, temp); |
|---|
| | 826 | tc0vec = vec_mergeh(tc0vec, tc0vec); |
|---|
| | 827 | tc0vec = vec_mergeh(tc0vec, tc0vec); |
|---|
| | 828 | mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_u8(-1))); /* if tc0[i] >= 0 */ |
|---|
| | 829 | /* vec_st(mask, 0, temp); |
|---|
| | 830 | for(i=0; i<16; i++) |
|---|
| | 831 | { |
|---|
| | 832 | int p0s = pix[i-stride]; |
|---|
| | 833 | int p1s = pix[i-2*stride]; |
|---|
| | 834 | int q0s = pix[i]; |
|---|
| | 835 | int q1s = pix[i+stride]; |
|---|
| | 836 | unsigned char val = 00; |
|---|
| | 837 | if(FFABS( p0s - q0s ) < alpha && |
|---|
| | 838 | FFABS( p1s - p0s ) < beta && |
|---|
| | 839 | FFABS( q1s - q0s ) < beta && |
|---|
| | 840 | tc0[i>>2] >= 0) |
|---|
| | 841 | val = 0xff; |
|---|
| | 842 | if(val != temp[i]) |
|---|
| | 843 | temp[i] = 0x80; |
|---|
| | 844 | }*/ |
|---|
| | 845 | finaltc0 = vec_and(tc0vec, mask); /*tc = tc0[i]*/ |
|---|
| | 846 | |
|---|
| | 847 | p2 = vec_ld(-3*stride, pix); |
|---|
| | 848 | p1mask = diff_lt_altivec(p2, p0, betavec); |
|---|
| | 849 | p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ |
|---|
| | 850 | /* vec_st(p1mask, 0, temp); |
|---|
| | 851 | for(i=0; i<16; i++) |
|---|
| | 852 | { |
|---|
| | 853 | int p0s = pix[i-stride]; |
|---|
| | 854 | int p1s = pix[i-2*stride]; |
|---|
| | 855 | int p2s = pix[i-3*stride]; |
|---|
| | 856 | int q0s = pix[i]; |
|---|
| | 857 | int q1s = pix[i+stride]; |
|---|
| | 858 | unsigned char val = 00; |
|---|
| | 859 | if(FFABS( p0s - q0s ) < alpha && |
|---|
| | 860 | FFABS( p1s - p0s ) < beta && |
|---|
| | 861 | FFABS( q1s - q0s ) < beta && |
|---|
| | 862 | FFABS( p2s - p0s ) < beta && |
|---|
| | 863 | tc0[i>>2] >= 0) |
|---|
| | 864 | val = 0xff; |
|---|
| | 865 | if(val != temp[i]) |
|---|
| | 866 | temp[i] = 0x80; |
|---|
| | 867 | }*/ |
|---|
| | 868 | tc0masked = vec_and(p1mask, tc0vec); |
|---|
| | 869 | finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ |
|---|
| | 870 | h264_deblock_q1(p0, p1, p2, q0, tc0masked, pix-2*stride); |
|---|
| | 871 | /*end if*/ |
|---|
| | 872 | |
|---|
| | 873 | q2 = vec_ld(2*stride, pix); |
|---|
| | 874 | q1mask = diff_lt_altivec(q2, q0, betavec); |
|---|
| | 875 | q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */ |
|---|
| | 876 | /* vec_st(q1mask, 0, temp); |
|---|
| | 877 | vec_st(p1, 0, temp2); |
|---|
| | 878 | for(i=0; i<16; i++) |
|---|
| | 879 | { |
|---|
| | 880 | int p0s = pix[i-stride]; |
|---|
| | 881 | int p1s = temp2[i]; |
|---|
| | 882 | int q0s = pix[i]; |
|---|
| | 883 | int q1s = pix[i+stride]; |
|---|
| | 884 | int q2s = pix[i+2*stride]; |
|---|
| | 885 | unsigned char val = 00; |
|---|
| | 886 | if(FFABS( p0s - q0s ) < alpha && |
|---|
| | 887 | FFABS( p1s - p0s ) < beta && |
|---|
| | 888 | FFABS( q1s - q0s ) < beta && |
|---|
| | 889 | FFABS( q2s - q0s ) < beta && |
|---|
| | 890 | tc0[i>>2] >= 0) |
|---|
| | 891 | val = 0xff; |
|---|
| | 892 | if(val != temp[i]) |
|---|
| | 893 | temp[i] = 0x80; |
|---|
| | 894 | }*/ |
|---|
| | 895 | tc0masked = vec_and(q1mask, tc0vec); |
|---|
| | 896 | finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ |
|---|
| | 897 | h264_deblock_q1(p0, q1, q2, q0, tc0masked, pix+stride); |
|---|
| | 898 | /*end if*/ |
|---|
| | 899 | |
|---|
| | 900 | h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0, pix-stride, pix); |
|---|
| | 901 | } |
|---|
| | 902 | |
|---|
/*
 * Scalar reference implementation of the luma deblocking filter.
 *
 * Processes 16 positions along the edge in four groups of four; tc0[group]
 * applies to the whole group and a negative value skips it. xstride steps
 * across the edge (p2 p1 p0 | q0 q1 q2), ystride steps along it.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int group, line;

    for (group = 0; group < 4; group++) {
        if (tc0[group] < 0) {
            /* filtering disabled for these four lines */
            pix += 4*ystride;
            continue;
        }

        for (line = 0; line < 4; line++, pix += ystride) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            int tc;
            int i_delta;

            /* leave the samples alone unless the step looks like a blocking artifact */
            if (FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta)
                continue;

            tc = tc0[group];

            if (FFABS( p2 - p0 ) < beta) {
                pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[group], tc0[group] );
                tc++;
            }
            if (FFABS( q2 - q0 ) < beta) {
                pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[group], tc0[group] );
                tc++;
            }

            i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
            pix[0]        = av_clip_uint8( q0 - i_delta ); /* q0' */
        }
    }
}
|---|
| | 943 | |
|---|
/*
 * Luma loop filter entry point that applies the AltiVec filter directly at
 * pix with the given stride, without any transpose.
 */
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /* The AND keeps its sign bit only if every tc0 is negative,
     * i.e. when there is nothing to filter at all. */
    const int combined = tc0[0] & tc0[1] & tc0[2] & tc0[3];

    if (combined < 0)
        return;

    h264_loop_filter_luma_altivec(pix, stride, alpha, beta, tc0);
    /* scalar equivalent: h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0); */
}
|---|
| | 950 | |
|---|
/*
 * Luma loop filter entry point for the edge orientation that needs a
 * transpose: the samples across the edge lie along a row of pix.
 *
 * The 8 columns pix-4 .. pix+3 of 16 rows are transposed into a 16-byte
 * aligned scratch buffer of 8 rows x 16 bytes, so that q0 is row 4. The
 * AltiVec filter then runs on the scratch buffer with a stride of 16, and
 * scratch rows 2..5 (p1, p0, q0, q1) are transposed back to the columns
 * starting at pix-2, four picture rows at a time.
 */
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    // also, it only needs to transpose 6x8
    unsigned char trans[16*8] __attribute__((aligned(16)));
    int i;

    /* Nothing to do when all four tc0 are negative: the AND of the four
     * values keeps its sign bit only in that case. (The last operator used
     * to be '*', which made this test wrong.) */
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;

    transpose8x16(trans, pix-4, 16, stride);
    h264_loop_filter_luma_altivec(trans+4*16, 16, alpha, beta, tc0);
    /* scalar equivalent: h264_loop_filter_luma_c(trans+4*16, 16, 1, alpha, beta, tc0); */

    /* write the four modified columns back, one 4x4 tile per four picture rows */
    for (i = 0; i < 4; i++)
        transpose4x4(pix-2 + 4*i*stride, trans + 4*i + 2*16, stride, 16);
}
|---|
| | 966 | |
|---|