| | 527 | /* A routine to read an unaligned vector. Thanks for the example code Apple */ |
| | 528 | static inline vector unsigned char read_unaligned(int offset, uint8_t *src) |
| | 529 | { |
| | 530 | register vector unsigned char first = vec_ld(offset, src); |
| | 531 | register vector unsigned char second = vec_ld(offset+15, src); |
| | 532 | register vector unsigned char mask = vec_lvsl(offset, src); |
| | 533 | return vec_perm(first, second, mask); |
| | 534 | } |
| | 535 | |
/*
 * Interleave four 16-byte vectors in place with two rounds of
 * vec_mergeh/vec_mergel.  Afterwards every group of four consecutive bytes
 * holds byte k of the original r0, r1, r2, r3 (in that order):
 * r0 carries k = 0..3, r1 k = 4..7, r2 k = 8..11 and r3 k = 12..15.
 * All four arguments must be modifiable vector lvalues.
 */
#define transpose4x16(r0, r1, r2, r3){\
    /* Round 1: pair r0 with r2 and r1 with r3 */\
    register vector unsigned char t4_02h = vec_mergeh(r0, r2);\
    register vector unsigned char t4_02l = vec_mergel(r0, r2);\
    register vector unsigned char t4_13h = vec_mergeh(r1, r3);\
    register vector unsigned char t4_13l = vec_mergel(r1, r3);\
\
    /* Round 2: interleave the two pairings, giving r0,r1,r2,r3 byte order */\
    r0 = vec_mergeh(t4_02h, t4_13h);\
    r1 = vec_mergel(t4_02h, t4_13h);\
    r2 = vec_mergeh(t4_02l, t4_13l);\
    r3 = vec_mergel(t4_02l, t4_13l);\
}
| | 552 | |
| | 553 | static inline void write16x4(uint8_t *dst, int dst_stride, register vector unsigned char r0, register vector unsigned char r1, register vector unsigned char r2, register vector unsigned char r3){ |
| | 554 | unsigned char result[64] __attribute__((aligned(16))); |
| | 555 | uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; |
| | 556 | int int_dst_stride = dst_stride/4; |
| | 557 | |
| | 558 | vec_st(r0, 0, result); |
| | 559 | vec_st(r1, 16, result); |
| | 560 | vec_st(r2, 32, result); |
| | 561 | vec_st(r3, 48, result); |
| | 562 | /* there has to be a better way!!!! */ |
| | 563 | *dst_int = *src_int; |
| | 564 | *(dst_int+int_dst_stride) = *(src_int + 1); |
| | 565 | *(dst_int+2*int_dst_stride) = *(src_int + 2); |
| | 566 | *(dst_int+3*int_dst_stride) = *(src_int + 3); |
| | 567 | *(dst_int+4*int_dst_stride) = *(src_int + 4); |
| | 568 | *(dst_int+5*int_dst_stride) = *(src_int + 5); |
| | 569 | *(dst_int+6*int_dst_stride) = *(src_int + 6); |
| | 570 | *(dst_int+7*int_dst_stride) = *(src_int + 7); |
| | 571 | *(dst_int+8*int_dst_stride) = *(src_int + 8); |
| | 572 | *(dst_int+9*int_dst_stride) = *(src_int + 9); |
| | 573 | *(dst_int+10*int_dst_stride) = *(src_int + 10); |
| | 574 | *(dst_int+11*int_dst_stride) = *(src_int + 11); |
| | 575 | *(dst_int+12*int_dst_stride) = *(src_int + 12); |
| | 576 | *(dst_int+13*int_dst_stride) = *(src_int + 13); |
| | 577 | *(dst_int+14*int_dst_stride) = *(src_int + 14); |
| | 578 | *(dst_int+15*int_dst_stride) = *(src_int + 15); |
| | 579 | } |
| | 580 | |
/*
 * Read 16 rows of 16 bytes (row i starts at src + i*src_stride, any
 * alignment) and transpose the first six columns of that block.
 * On exit r8..r13 hold columns 0..5 respectively; byte i of each output
 * comes from row i.  Columns 6 and above are not produced.
 * r8..r13 must be modifiable vector lvalues; src and src_stride are
 * evaluated several times.
 */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13){\
    register vector unsigned char rt_v0  = read_unaligned(0, (src));\
    register vector unsigned char rt_v1  = read_unaligned((src_stride), (src));\
    register vector unsigned char rt_v2  = read_unaligned(2*(src_stride), (src));\
    register vector unsigned char rt_v3  = read_unaligned(3*(src_stride), (src));\
    register vector unsigned char rt_v4  = read_unaligned(4*(src_stride), (src));\
    register vector unsigned char rt_v5  = read_unaligned(5*(src_stride), (src));\
    register vector unsigned char rt_v6  = read_unaligned(6*(src_stride), (src));\
    register vector unsigned char rt_v7  = read_unaligned(7*(src_stride), (src));\
    register vector unsigned char rt_v8  = read_unaligned(8*(src_stride), (src));\
    register vector unsigned char rt_v9  = read_unaligned(9*(src_stride), (src));\
    register vector unsigned char rt_v10 = read_unaligned(10*(src_stride), (src));\
    register vector unsigned char rt_v11 = read_unaligned(11*(src_stride), (src));\
    register vector unsigned char rt_v12 = read_unaligned(12*(src_stride), (src));\
    register vector unsigned char rt_v13 = read_unaligned(13*(src_stride), (src));\
    register vector unsigned char rt_v14 = read_unaligned(14*(src_stride), (src));\
    register vector unsigned char rt_v15 = read_unaligned(15*(src_stride), (src));\
\
    /* Stage 1: interleave row i with row i+8 (only columns 0-7 are kept) */\
    rt_v0 = vec_mergeh(rt_v0, rt_v8);\
    rt_v1 = vec_mergeh(rt_v1, rt_v9);\
    rt_v2 = vec_mergeh(rt_v2, rt_v10);\
    rt_v3 = vec_mergeh(rt_v3, rt_v11);\
    rt_v4 = vec_mergeh(rt_v4, rt_v12);\
    rt_v5 = vec_mergeh(rt_v5, rt_v13);\
    rt_v6 = vec_mergeh(rt_v6, rt_v14);\
    rt_v7 = vec_mergeh(rt_v7, rt_v15);\
\
    /* Stage 2: rows {i, i+4, i+8, i+12}; mergeh = columns 0-3, mergel = 4-7 */\
    rt_v8  = vec_mergeh(rt_v0, rt_v4);\
    rt_v9  = vec_mergel(rt_v0, rt_v4);\
    rt_v10 = vec_mergeh(rt_v1, rt_v5);\
    rt_v11 = vec_mergel(rt_v1, rt_v5);\
    rt_v12 = vec_mergeh(rt_v2, rt_v6);\
    rt_v13 = vec_mergel(rt_v2, rt_v6);\
    rt_v14 = vec_mergeh(rt_v3, rt_v7);\
    rt_v15 = vec_mergel(rt_v3, rt_v7);\
\
    /* Stage 3: all even rows (v0-v2) and all odd rows (v4-v6), two columns */\
    /* per vector; the vectors for columns 6-7 are not needed and skipped   */\
    rt_v0 = vec_mergeh(rt_v8, rt_v12);\
    rt_v1 = vec_mergel(rt_v8, rt_v12);\
    rt_v2 = vec_mergeh(rt_v9, rt_v13);\
    rt_v4 = vec_mergeh(rt_v10, rt_v14);\
    rt_v5 = vec_mergel(rt_v10, rt_v14);\
    rt_v6 = vec_mergeh(rt_v11, rt_v15);\
\
    /* Stage 4: merge even and odd rows, one full column per output */\
    r8  = vec_mergeh(rt_v0, rt_v4);\
    r9  = vec_mergel(rt_v0, rt_v4);\
    r10 = vec_mergeh(rt_v1, rt_v5);\
    r11 = vec_mergel(rt_v1, rt_v5);\
    r12 = vec_mergeh(rt_v2, rt_v6);\
    r13 = vec_mergel(rt_v2, rt_v6);\
}
| | 640 | |
| | 641 | // out: o = |x-y| < a |
| | 642 | static inline vector unsigned char diff_lt_altivec( |
| | 643 | register vector unsigned char x, |
| | 644 | register vector unsigned char y, |
| | 645 | register vector unsigned char a) { |
| | 646 | |
| | 647 | register vector unsigned char diff = vec_subs(x, y); |
| | 648 | register vector unsigned char diffneg = vec_subs(y, x); |
| | 649 | register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */ |
| | 650 | o = vec_cmplt(o, a); |
| | 651 | return o; |
| | 652 | } |
| | 653 | |
| | 654 | static inline vector unsigned char h264_deblock_mask( |
| | 655 | register vector unsigned char p0, |
| | 656 | register vector unsigned char p1, |
| | 657 | register vector unsigned char q0, |
| | 658 | register vector unsigned char q1, |
| | 659 | register vector unsigned char alpha, |
| | 660 | register vector unsigned char beta) { |
| | 661 | |
| | 662 | register vector unsigned char mask; |
| | 663 | register vector unsigned char tempmask; |
| | 664 | |
| | 665 | mask = diff_lt_altivec(p0, q0, alpha); |
| | 666 | tempmask = diff_lt_altivec(p1, p0, beta); |
| | 667 | mask = vec_and(mask, tempmask); |
| | 668 | tempmask = diff_lt_altivec(q1, q0, beta); |
| | 669 | mask = vec_and(mask, tempmask); |
| | 670 | |
| | 671 | return mask; |
| | 672 | } |
| | 673 | |
/*
 * Updates p1 in place:
 *   p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1 - tc0, p1 + tc0)
 * with the clip bounds computed by saturating unsigned add/subtract.
 * p1 must be a modifiable vector lvalue; the other arguments are only read.
 */
#define h264_deblock_q1(p0, p1, p2, q0, tc0){\
    register vector unsigned char dq1_avg;\
    register vector unsigned char dq1_odd;\
    register vector unsigned char dq1_target;\
    register vector unsigned char dq1_upper;\
    register vector unsigned char dq1_lower;\
\
    dq1_avg = vec_avg(p0, q0); /* (p0 + q0 + 1) >> 1 */\
    /* vec_avg rounds up; bit 0 of (avg ^ p2) is set exactly when avg + p2 */\
    /* is odd, so subtracting it turns the rounded average into a floor.   */\
    dq1_odd = vec_and(vec_xor(dq1_avg, p2), vec_splat_u8(1));\
    dq1_target = vec_subs(vec_avg(dq1_avg, p2), dq1_odd); /* (p2 + avg) >> 1 */\
\
    dq1_upper = vec_adds(p1, tc0);\
    dq1_lower = vec_subs(p1, tc0);\
    p1 = vec_min(dq1_upper, vec_max(dq1_lower, dq1_target));\
}
| | 694 | |
/*
 * Updates p0 and q0 in place by a delta derived from p0, p1, q0 and q1 and
 * limited to tc0masked per byte.  Everything is done in unsigned bytes:
 * the signed value d is accumulated with a bias of 161 (0xA1), then split
 * into a positive part (value - 161) and a negative part (161 - value) by
 * saturating subtraction, each clamped with vec_min against tc0masked.
 * p0 and q0 must be modifiable vector lvalues.
 */
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked){\
\
    const vec_u8_t A1v = (vec_u8_t) AVV(0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,\
                                        0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1,0xA1);\
\
    register vector unsigned char pq_allones;\
    register vector unsigned char pq_lsb;\
    register vector unsigned char pq_outer;\
    register vector unsigned char pq_inner;\
    register vector unsigned char pq_acc;\
    register vector unsigned char pq_bias;\
    register vector unsigned char pq_dpos;\
    register vector unsigned char pq_dneg;\
\
    pq_lsb = vec_and(vec_xor(p0, q0), vec_splat_u8(1)); /* (p0^q0)&1 */\
    pq_allones = (vector unsigned char)vec_cmpeq(p0, p0); /* 0xFF in every byte */\
\
    /* x ^ 0xFF is 255 - x, so these averages are biased differences */\
    pq_outer = vec_avg(p1, vec_xor(pq_allones, q1)); /* (p1 - q1 + 256)>>1 */\
    pq_acc = vec_avg(vec_splat_u8(3), pq_outer); /* (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 */\
    pq_inner = vec_avg(q0, vec_xor(pq_allones, p0)); /* (q0 - p0 + 256)>>1 */\
    pq_acc = vec_avg(pq_acc, pq_lsb); /* 33+(p1-q1)>>3 */\
    pq_acc = vec_adds(pq_acc, pq_inner); /* d+128 + 33 */\
\
    pq_bias = vec_ld(0, &A1v); /* 161 in every byte */\
    pq_dneg = vec_min(tc0masked, vec_subs(pq_bias, pq_acc)); /* min(tc, -d), 0 if d >= 0 */\
    pq_dpos = vec_min(tc0masked, vec_subs(pq_acc, pq_bias)); /* min(tc, d), 0 if d <= 0 */\
\
    p0 = vec_adds(vec_subs(p0, pq_dneg), pq_dpos);\
    q0 = vec_adds(vec_subs(q0, pq_dpos), pq_dneg);\
}
| | 729 | |
/*
 * Normal-strength luma deblocking of 16 samples across one edge.
 *
 * p2,p1,p0 / q0,q1,q2: vectors holding the three samples on each side of
 *     the edge for 16 positions.  p1, p0, q0 and q1 are updated in place
 *     (they must be modifiable lvalues); p2 and q2 are only read.
 * alpha, beta: scalar thresholds, splatted to every byte.
 * tc0: pointer to four signed bytes; entry i applies to positions
 *     4*i .. 4*i+3, and a negative entry disables filtering there.
 *
 * The new p1/q1 values are computed into temporaries and written back only
 * after p0/q0 have been filtered, so that the p0/q0 delta is derived from
 * the unfiltered p1 and q1.
 */
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0){\
    unsigned char temp[16] __attribute__((aligned(16)));\
    register vector unsigned char alphavec;\
    register vector unsigned char betavec;\
    register vector unsigned char mask;\
    register vector unsigned char p1mask;\
    register vector unsigned char q1mask;\
    register vector signed char tc0vec;\
    register vector unsigned char finaltc0;\
    register vector unsigned char tc0masked;\
    register vector unsigned char newp1;\
    register vector unsigned char newq1;\
\
    /* splat alpha and beta across all 16 bytes */\
    temp[0] = (alpha);\
    temp[1] = (beta);\
    alphavec = vec_ld(0, temp);\
    betavec = vec_splat(alphavec, 0x1);\
    alphavec = vec_splat(alphavec, 0x0);\
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */\
\
    /* copy the four tc0 bytes one by one: no int-typed access to tc0 */\
    temp[0] = (tc0)[0];\
    temp[1] = (tc0)[1];\
    temp[2] = (tc0)[2];\
    temp[3] = (tc0)[3];\
    tc0vec = vec_ld(0, (signed char *)temp);\
    /* two self-merges replicate each of the first four bytes four times */\
    tc0vec = vec_mergeh(tc0vec, tc0vec);\
    tc0vec = vec_mergeh(tc0vec, tc0vec);\
    /* signed compare, so that negative tc0 entries clear the mask */\
    mask = vec_and(mask, (vector unsigned char)vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */\
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /*tc = tc0[i]*/\
\
    p1mask = diff_lt_altivec(p2, p0, betavec);\
    p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */\
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);\
    finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ (mask bytes are -1) */\
    newp1 = p1;\
    h264_deblock_q1(p0, newp1, p2, q0, tc0masked);\
    /*end if*/\
\
    q1mask = diff_lt_altivec(q2, q0, betavec);\
    q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);\
    finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ (mask bytes are -1) */\
    newq1 = q1;\
    h264_deblock_q1(p0, newq1, q2, q0, tc0masked);\
    /*end if*/\
\
    /* p1 and q1 still hold their unfiltered values here */\
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);\
    p1 = newp1;\
    q1 = newq1;\
}
| | 771 | |
| | 772 | static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
| | 773 | { |
| | 774 | if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) |
| | 775 | { |
| | 776 | register vector unsigned char p2 = vec_ld(-3*stride, pix); |
| | 777 | register vector unsigned char p1 = vec_ld(-2*stride, pix); |
| | 778 | register vector unsigned char p0 = vec_ld(-1*stride, pix); |
| | 779 | register vector unsigned char q0 = vec_ld(0, pix); |
| | 780 | register vector unsigned char q1 = vec_ld(stride, pix); |
| | 781 | register vector unsigned char q2 = vec_ld(2*stride, pix); |
| | 782 | h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); |
| | 783 | vec_st(p1, -2*stride, pix); |
| | 784 | vec_st(p0, -1*stride, pix); |
| | 785 | vec_st(q0, 0, pix); |
| | 786 | vec_st(q1, stride, pix); |
| | 787 | } |
| | 788 | } |
| | 789 | |
| | 790 | static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
| | 791 | { |
| | 792 | register vector unsigned char line0, line1, line2, line3, line4, line5; |
| | 793 | if((tc0[0] & tc0[1] & tc0[2] * tc0[3]) < 0) |
| | 794 | return; |
| | 795 | readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); |
| | 796 | h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); |
| | 797 | transpose4x16(line1, line2, line3, line4); |
| | 798 | write16x4(pix-2, stride, line1, line2, line3, line4); |
| | 799 | } |
| | 800 | |