| | 407 | #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ |
|---|
| | 408 | /* 1-D 4-point butterfly, applied lane-wise: reads vb0..vb3, writes va0..va3. vz0..vz3 are not parameters; they must exist in the caller's scope and are overwritten. 1st stage */ \ |
|---|
| | 409 | vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ |
|---|
| | 410 | vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ |
|---|
| | 411 | vz2 = vec_sra(vb1,vec_splat_u16(1)); /* Y[1] >> 1 (arithmetic shift) */ \ |
|---|
| | 412 | vz2 = vec_sub(vz2,vb3); /* temp[2] = (Y[1] >> 1) - Y[3] */ \ |
|---|
| | 413 | vz3 = vec_sra(vb3,vec_splat_u16(1)); /* Y[3] >> 1 (arithmetic shift) */ \ |
|---|
| | 414 | vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + (Y[3] >> 1) */ \ |
|---|
| | 415 | /* 2nd stage: output */ \ |
|---|
| | 416 | va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ |
|---|
| | 417 | va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ |
|---|
| | 418 | va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ |
|---|
| | 419 | va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3]; no trailing ';' here, the call site supplies it */ |
|---|
| | 420 | /* Transposes a 4x4 matrix held in the first four elements of a0..a3 (one row per vector, 8 elements of 16 bits each as used by ff_h264_idct_add_altivec). Result: the first four elements of b0..b3 are columns 0..3. a0..a3 are clobbered; the last four elements of each output are filler copied from row 0. */ |
|---|
| | 421 | #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ |
|---|
| | 422 | b0 = vec_mergeh( a0, a0 ); /* stage 1: interleave the first four elements of each row with those of a0 (a0 is only filler) */ \ |
|---|
| | 423 | b1 = vec_mergeh( a1, a0 ); \ |
|---|
| | 424 | b2 = vec_mergeh( a2, a0 ); \ |
|---|
| | 425 | b3 = vec_mergeh( a3, a0 ); \ |
|---|
| | 426 | a0 = vec_mergeh( b0, b2 ); /* stage 2: pair rows 0/2 and rows 1/3; a0..a3 are reused as scratch from here on */ \ |
|---|
| | 427 | a1 = vec_mergel( b0, b2 ); \ |
|---|
| | 428 | a2 = vec_mergeh( b1, b3 ); \ |
|---|
| | 429 | a3 = vec_mergel( b1, b3 ); \ |
|---|
| | 430 | b0 = vec_mergeh( a0, a2 ); /* stage 3: b0 begins with column 0 (a00 a10 a20 a30) */ \ |
|---|
| | 431 | b1 = vec_mergel( a0, a2 ); /* b1 begins with column 1 */ \ |
|---|
| | 432 | b2 = vec_mergeh( a1, a3 ); /* b2 begins with column 2 */ \ |
|---|
| | 433 | b3 = vec_mergel( a1, a3 ) /* b3 begins with column 3; no trailing ';' here, the call site supplies it */ |
|---|
| | 434 | /* Adds the first four 16-bit lanes of va (saturated to u8) to four pixels at a non-16-byte-aligned p and stores them back through p. mask: alignment permute for p (callers pass vec_lvsl(0, dst)); perm: byte selector where 0x00-0x0F keep the original destination bytes and 0x10-0x13 take the new pixels. Uses vdst_orig, vdst, vdst_ss, va_u8, vfdst and vzero from the caller's scope; va is overwritten with the sum. p is evaluated twice. */ |
|---|
| | 435 | #define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,mask,va,perm)\ |
|---|
| | 436 | vdst_orig = vec_ld(0,p); /* the 16 destination bytes that will be written back */ \ |
|---|
| | 437 | vdst = vec_perm(vdst_orig, (vec_u8_t)vzero, mask); /* move the pixels at p to the front, zero-filling behind them */ \ |
|---|
| | 438 | vdst_ss = (vec_s16_t) vec_mergeh((vec_u8_t)vzero, vdst); /* widen the first 8 bytes to 16 bits by pairing each with a zero byte */ \ |
|---|
| | 439 | va = vec_add(va,vdst_ss); /* residual + prediction */ \ |
|---|
| | 440 | va_u8 = vec_packsu(va,(vec_s16_t) vzero); /* saturate to unsigned 8 bits */ \ |
|---|
| | 441 | vfdst = vec_perm(vdst_orig, va_u8, perm); /* splice the new pixels into the untouched destination bytes */ \ |
|---|
| | 442 | vec_st(vfdst, 0, p); /* store through p, the same address the load above used */ |
|---|
| | 443 | /* Adds the first four 16-bit lanes of va (saturated to u8) to the first four pixels of the 16-byte line at p and stores the line back through p. perm: byte selector where 0x00-0x0F keep the original destination bytes and 0x10-0x13 take the new pixels. Uses vdst, vdst_ss, va_u8, vfdst and vzero from the caller's scope; va is overwritten with the sum. p is evaluated twice. */ |
|---|
| | 444 | #define VEC_LOAD_U8_ADD_S16_STORE_U8(p,va,perm) \ |
|---|
| | 445 | vdst = vec_ld(0, p); /* the 16 destination bytes */ \ |
|---|
| | 446 | vdst_ss = (vec_s16_t)vec_mergeh((vec_u8_t)vzero, vdst); /* widen the first 8 bytes to 16 bits by pairing each with a zero byte */ \ |
|---|
| | 447 | va = vec_add(va,vdst_ss); /* residual + prediction */ \ |
|---|
| | 448 | va_u8 = vec_packsu(va,(vec_s16_t) vzero); /* saturate to unsigned 8 bits */ \ |
|---|
| | 449 | vfdst = vec_perm(vdst, va_u8, perm); /* splice the new pixels into the untouched destination bytes */ \ |
|---|
| | 450 | vec_st(vfdst, 0, p); /* store through p, the same address the load above used */ |
|---|
| | 451 | /* Runs two VEC_1D_DCT passes (with a transpose in between) over the 16 coefficients in block[], shifts the result right by 6 and adds it to four rows of four pixels starting at dst, rows being stride bytes apart. Side effect: block[0] is incremented by 32 in the caller's buffer. */ |
|---|
| | 452 | static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) |
|---|
| | 453 | { |
|---|
| | 454 | vec_s16_t va0, va1, va2, va3; /* one coefficient row per vector; only the first four lanes are meaningful */ |
|---|
| | 455 | vec_s16_t vz0, vz1, vz2, vz3; /* scratch used inside VEC_1D_DCT */ |
|---|
| | 456 | vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; /* inputs of each VEC_1D_DCT pass */ |
|---|
| | 457 | vec_u8_t va_u8; /* used inside the load/add/store macros */ |
|---|
| | 458 | vec_s16_t vdst_ss; /* used inside the load/add/store macros */ |
|---|
| | 459 | vec_u8_t vmaskl; |
|---|
| | 460 | vec_s16_t vload1, vload2, vload3; |
|---|
| | 461 | const vec_u16_t v6us = vec_splat_u16(6); /* final shift count */ |
|---|
| | 462 | const vec_s32_t vzero = vec_splat_s32(0); |
|---|
| | 463 | const int block_stride = 8; /* byte offset passed to vec_lvsl; equals one row of four coefficients if DCTELEM is 16 bits wide (TODO confirm) */ |
|---|
| | 464 | vec_u8_t dstperm; |
|---|
| | 465 | vec_u8_t vdst, vdst_orig, vfdst; /* used inside the load/add/store macros */ |
|---|
| | 466 | |
|---|
| | 467 | block[0] += 32; /* add 32 as a DC-level for rounding; pairs with the >> 6 below and modifies the caller's block */ |
|---|
| | 468 | /* Load the coefficients: two 16-byte loads, then split into four row vectors. Presumably block is 16-byte aligned (TODO confirm). */ |
|---|
| | 469 | vload1 = vec_ld(0,block); |
|---|
| | 470 | vload2 = vec_ld(16,block); |
|---|
| | 471 | vload3 = (vec_s16_t) vzero; /* zero padding shifted in behind the last row */ |
|---|
| | 472 | vmaskl = vec_lvsl(block_stride,block); /* permute mask that shifts a vector pair left by block_stride bytes (plus block's own misalignment, if any) */ |
|---|
| | 473 | va0 = vload1; /* first 16 bytes as loaded */ |
|---|
| | 474 | va1 = vec_perm(vload1,vload2,vmaskl); /* vload1:vload2 shifted so the second group of four coefficients is in front */ |
|---|
| | 475 | va2 = vload2; /* second 16 bytes as loaded */ |
|---|
| | 476 | va3 = vec_perm(vload2,vload3,vmaskl); /* vload2:zero shifted so the last group of four coefficients is in front */ |
|---|
| | 477 | /* The transpose before the first pass is disabled (commented-out line below); the first VEC_1D_DCT runs on the vectors as loaded. */ |
|---|
| | 478 | // VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); |
|---|
| | 479 | vtmp0 = va0; |
|---|
| | 480 | vtmp1 = va1; |
|---|
| | 481 | vtmp2 = va2; |
|---|
| | 482 | vtmp3 = va3; |
|---|
| | 483 | VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); /* first pass */ |
|---|
| | 484 | VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); /* clobbers va0..va3, result in vtmp0..vtmp3 */ |
|---|
| | 485 | VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); /* second pass */ |
|---|
| | 486 | /* Final scaling: arithmetic shift right by 6. */ |
|---|
| | 487 | va0 = vec_sra(va0,v6us); |
|---|
| | 488 | va1 = vec_sra(va1,v6us); |
|---|
| | 489 | va2 = vec_sra(va2,v6us); |
|---|
| | 490 | va3 = vec_sra(va3,v6us); |
|---|
| | 491 | /* Add each result row to the destination. dstperm decides which 4 bytes of the 16-byte destination line are replaced: indices 0x10-0x13 select the new pixels, 0x00-0x0F keep the original bytes. */ |
|---|
| | 492 | if ((unsigned long)dst & 0xF){ /* dst is not 16-byte aligned */ |
|---|
| | 493 | vec_u8_t vdst_mask; |
|---|
| | 494 | switch ((unsigned long)dst & 0xF){ |
|---|
| | 495 | case 0: /* NOTE(review): unreachable, the enclosing if already excludes offset 0 */ |
|---|
| | 496 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 497 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 498 | break; |
|---|
| | 499 | case 4: /* new pixels go to bytes 4..7 of the line */ |
|---|
| | 500 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, |
|---|
| | 501 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 502 | break; |
|---|
| | 503 | case 8: /* new pixels go to bytes 8..11 of the line */ |
|---|
| | 504 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 505 | 0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 506 | break; |
|---|
| | 507 | case 12: /* new pixels go to bytes 12..15 of the line */ |
|---|
| | 508 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 509 | 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13); |
|---|
| | 510 | break; |
|---|
| | 511 | default: /* NOTE(review): offsets that are not a multiple of 4 get the offset-0 pattern, which would write to bytes 0..3 of the line; confirm callers only pass 4-byte-aligned dst */ |
|---|
| | 512 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 513 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 514 | break; |
|---|
| | 515 | } |
|---|
| | 516 | /* NOTE(review): dstperm and vdst_mask are derived from the first row's address only and reused after dst += stride; this presumably relies on stride being a multiple of 16 (confirm). */ |
|---|
| | 517 | vdst_mask = vec_lvsl(0, dst); |
|---|
| | 518 | |
|---|
| | 519 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va0,dstperm); |
|---|
| | 520 | dst += stride; |
|---|
| | 521 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va1,dstperm); |
|---|
| | 522 | dst += stride; |
|---|
| | 523 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va2,dstperm); |
|---|
| | 524 | dst += stride; |
|---|
| | 525 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va3,dstperm); |
|---|
| | 526 | } |
|---|
| | 527 | else{ /* dst is 16-byte aligned: new pixels go to bytes 0..3 of the line */ |
|---|
| | 528 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 529 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 530 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va0,dstperm); |
|---|
| | 531 | dst += stride; |
|---|
| | 532 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va1,dstperm); |
|---|
| | 533 | dst += stride; |
|---|
| | 534 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va2,dstperm); |
|---|
| | 535 | dst += stride; |
|---|
| | 536 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va3,dstperm); |
|---|
| | 537 | } |
|---|
| | 538 | } |
|---|
| | 539 | /* Variant of the 4x4 transform-and-add that evaluates both passes as multiply-sums (vec_msum) against constant coefficient vectors pre-scaled by 2. block[] is only read; the rounding term (128) is added in the accumulator and the result is shifted right by 8 before being added to four rows of four pixels at dst. NOTE(review): whether the output is bit-exact with ff_h264_idct_add_altivec is not evident from this code. */ |
|---|
| | 540 | static void ff_h264_idct_add_altivec_mat(uint8_t *dst, DCTELEM *block, int stride) |
|---|
| | 541 | { |
|---|
| | 542 | vec_s16_t va0, va1, va2, va3; /* final rows, first four lanes meaningful */ |
|---|
| | 543 | vec_s32_t va0_sw, va1_sw, va2_sw, va3_sw; /* 32-bit accumulators of the second pass */ |
|---|
| | 544 | vec_s16_t vz0, vz1; /* first-pass results packed to 16 bits */ |
|---|
| | 545 | vec_u8_t va_u8; /* used inside the load/add/store macros */ |
|---|
| | 546 | vec_s16_t vdst_ss; /* used inside the load/add/store macros */ |
|---|
| | 547 | vec_s16_t vload1, vload2; |
|---|
| | 548 | const vec_u16_t v8us = vec_splat_u16(8); /* final shift count */ |
|---|
| | 549 | const vec_s32_t vzero = vec_splat_s32(0); |
|---|
| | 550 | vec_u8_t dstperm; |
|---|
| | 551 | vec_u8_t vdst, vdst_orig, vfdst; /* used inside the load/add/store macros */ |
|---|
| | 552 | vec_s16_t vy0_a, vy0_b,vy1_a, vy1_b; /* reordered copies of the input coefficients */ |
|---|
| | 553 | vec_s32_t vz0_a, vz0_b, vz1_a, vz1_b; /* 32-bit accumulators of the first pass */ |
|---|
| | 554 | vec_s32_t v32_sw = (vec_s32_t) AVV(128, 128, 128, 128); /* rounding term 1<<(shift-1) for shift = 8; note the value is 128 despite the variable name */ |
|---|
| | 555 | /* vperm1/vperm2 reorder the 32-bit words of one vector. All indices are below 0x10, so the second vec_perm operand is never selected. vperm1 yields words 0,2,1,3; vperm2 yields words 1,3,0,2. */ |
|---|
| | 556 | vec_u8_t vperm1 = (vec_u8_t) AVV(0x00,0x01,0x02,0x03,0x08,0x09,0x0A,0x0B,0x04,0x05,0x06,0x07,0x0C,0x0D,0x0E,0x0F); |
|---|
| | 557 | vec_u8_t vperm2 = (vec_u8_t) AVV(0x04,0x05,0x06,0x07,0x0C,0x0D,0x0E,0x0F,0x00,0x01,0x02,0x03,0x08,0x09,0x0A,0x0B); |
|---|
| | 558 | /* First-pass coefficient vectors (vm0..vm3); the comment after each one names the matrix entries it is meant to hold. */ |
|---|
| | 559 | vec_s16_t vm0 = (vec_s16_t) AVV( 2, 2, 2, 2, -2, -2, -2, -2); |
|---|
| | 560 | /* 2*(a00, a10, a00, a10, a21, a31, a21, a31) */ |
|---|
| | 561 | vec_s16_t vm1 = (vec_s16_t) AVV( 2, 1, 2, 1, 2, 1, 2, 1); |
|---|
| | 562 | /* 2*(a20, a30, a20, a30, a01, a11, a01, a11) */ |
|---|
| | 563 | vec_s16_t vm2 = (vec_s16_t) AVV( 2, -1, 2, -1, 2, -1, 2, -1); |
|---|
| | 564 | /* 2*(a02, a12, a02, a12, a23, a33, a23, a33) */ |
|---|
| | 565 | vec_s16_t vm3 = (vec_s16_t) AVV(-2, 1, -2, 1, 2, -2, 2, -2); |
|---|
| | 566 | /* 2*(a22, a32, a22, a32, a03, a13, a03, a13) */ |
|---|
| | 567 | /* Second-pass coefficient vectors (vn0..vn7); same commenting convention as above. */ |
|---|
| | 568 | vec_s16_t vn0 = vec_splat_s16(2); |
|---|
| | 569 | /* 2*(b00, b01, b00, b01, b00, b01, b00, b01) */ |
|---|
| | 570 | vec_s16_t vn1 = (vec_s16_t) AVV( 2, 1, 2, 1, 2, 1, 2, 1); |
|---|
| | 571 | /* 2*(b02, b03, b02, b03, b02, b03, b02, b03) */ |
|---|
| | 572 | vec_s16_t vn2 = (vec_s16_t) AVV( 2, 1, 2, 1, 2, 1, 2, 1); |
|---|
| | 573 | /* 2*(b10, b11, b10, b11, b10, b11, b10, b11) */ |
|---|
| | 574 | vec_s16_t vn3 = (vec_s16_t) AVV(-2,- 2, -2, -2, -2, -2, -2, -2); |
|---|
| | 575 | /* 2*(b12, b13, b12, b13, b12, b13, b12, b13) */ |
|---|
| | 576 | vec_s16_t vn4 = (vec_s16_t) AVV( 2, -1, 2, -1, 2, -1, 2, -1); |
|---|
| | 577 | /* 2*(b20, b21, b20, b21, b20, b21, b20, b21) */ |
|---|
| | 578 | vec_s16_t vn5 = (vec_s16_t) AVV(-2, 2, -2, 2, -2, 2, -2, 2); |
|---|
| | 579 | /* 2*(b22, b32, b22, b32, b22, b32, b22, b32) */ |
|---|
| | 580 | vec_s16_t vn6 = (vec_s16_t) AVV( 2, -2, 2, -2, 2, -2, 2, -2); |
|---|
| | 581 | /* 2*(b30, b31, b30, b31, b30, b31, b30, b31) */ |
|---|
| | 582 | vec_s16_t vn7 = (vec_s16_t) AVV( 2, -1, 2, -1, 2, -1, 2, -1); |
|---|
| | 583 | /* 2*(b32, b33, b32, b33, b32, b33, b32, b33) */ |
|---|
| | 584 | /* Load the coefficients with two 16-byte loads. Presumably block is 16-byte aligned and DCTELEM is 16 bits wide (TODO confirm). */ |
|---|
| | 585 | vload1 = vec_ld(0,block); |
|---|
| | 586 | vload2 = vec_ld(16,block); |
|---|
| | 587 | /* Reorder each load for vec_msum; because of the index range of vperm1/vperm2 only the first operand of each vec_perm contributes. */ |
|---|
| | 588 | vy0_a = vec_perm(vload1,vload2,vperm1); |
|---|
| | 589 | vy0_b = vec_perm(vload1,vload2,vperm2); |
|---|
| | 590 | vy1_a = vec_perm(vload2,vload1,vperm1); |
|---|
| | 591 | vy1_b = vec_perm(vload2,vload1,vperm2); |
|---|
| | 592 | /* First pass: two chained multiply-sums per output vector, starting from a zero accumulator, then narrowed to 16 bits with vec_pack. */ |
|---|
| | 593 | vz0_a = vec_msum(vy0_a,vm0,vzero); |
|---|
| | 594 | vz0_a = vec_msum(vy0_b,vm1,vz0_a); /* z00, z10, z01, z11 */ |
|---|
| | 595 | vz0_b = vec_msum(vy0_a,vm2,vzero); |
|---|
| | 596 | vz0_b = vec_msum(vy0_b,vm3,vz0_b); /* z02, z12, z03, z13 */ |
|---|
| | 597 | vz0 = vec_pack(vz0_a, vz0_b); /* z00, z10, z01, z11 z02, z12, z03, z13 */ |
|---|
| | 598 | |
|---|
| | 599 | vz1_a = vec_msum(vy1_a,vm0,vzero); |
|---|
| | 600 | vz1_a = vec_msum(vy1_b,vm1,vz1_a); /* z20, z30, z21, z31 */ |
|---|
| | 601 | vz1_b = vec_msum(vy1_a,vm2,vzero); |
|---|
| | 602 | vz1_b = vec_msum(vy1_b,vm3,vz1_b); /* z22, z32, z23, z33 */ |
|---|
| | 603 | vz1 = vec_pack(vz1_a, vz1_b); /* z20, z30, z21, z31 z22, z32, z23, z33 */ |
|---|
| | 604 | /* Second pass: the rounding term v32_sw is the initial accumulator of each output row. */ |
|---|
| | 605 | va0_sw = vec_msum(vz0,vn0,v32_sw); |
|---|
| | 606 | va0_sw = vec_msum(vz1,vn1,va0_sw); /* x00, x01, x02, x03 */ |
|---|
| | 607 | va1_sw = vec_msum(vz0,vn2,v32_sw); |
|---|
| | 608 | va1_sw = vec_msum(vz1,vn3,va1_sw); /* x10, x11, x12, x13 */ |
|---|
| | 609 | va2_sw = vec_msum(vz0,vn4,v32_sw); |
|---|
| | 610 | va2_sw = vec_msum(vz1,vn5,va2_sw); /* x20, x21, x22, x23 */ |
|---|
| | 611 | va3_sw = vec_msum(vz0,vn6,v32_sw); |
|---|
| | 612 | va3_sw = vec_msum(vz1,vn7,va3_sw); /* x30, x31, x32, x33 */ |
|---|
| | 613 | /* Narrow to 16 bits. NOTE(review): vec_pack keeps only the low 16 bits of each 32-bit sum, and this happens before the >> 8 below; confirm the sums always fit in 16 bits. */ |
|---|
| | 614 | va0 = vec_pack(va0_sw,vzero); /* x00, x01, x02, x03, 00, 00, 00, 00 */ |
|---|
| | 615 | va1 = vec_pack(va1_sw,vzero); /* x10, x11, x12, x13, 00, 00, 00, 00 */ |
|---|
| | 616 | va2 = vec_pack(va2_sw,vzero); /* x20, x21, x22, x23, 00, 00, 00, 00 */ |
|---|
| | 617 | va3 = vec_pack(va3_sw,vzero); /* x30, x31, x32, x33, 00, 00, 00, 00 */ |
|---|
| | 618 | |
|---|
| | 619 | va0 = vec_sra(va0,v8us); /* 6 from the rounding shift +2 for matrix scaling */ |
|---|
| | 620 | va1 = vec_sra(va1,v8us); |
|---|
| | 621 | va2 = vec_sra(va2,v8us); |
|---|
| | 622 | va3 = vec_sra(va3,v8us); |
|---|
| | 623 | /* Add each result row to the destination. dstperm decides which 4 bytes of the 16-byte destination line are replaced: indices 0x10-0x13 select the new pixels, 0x00-0x0F keep the original bytes. */ |
|---|
| | 624 | if ((unsigned long)dst & 0xF){ /* dst is not 16-byte aligned */ |
|---|
| | 625 | vec_u8_t vdst_mask; |
|---|
| | 626 | switch ((unsigned long)dst & 0xF){ |
|---|
| | 627 | case 0: /* NOTE(review): unreachable, the enclosing if already excludes offset 0 */ |
|---|
| | 628 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 629 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 630 | break; |
|---|
| | 631 | case 4: /* new pixels go to bytes 4..7 of the line */ |
|---|
| | 632 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, |
|---|
| | 633 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 634 | break; |
|---|
| | 635 | case 8: /* new pixels go to bytes 8..11 of the line */ |
|---|
| | 636 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 637 | 0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 638 | break; |
|---|
| | 639 | case 12: /* new pixels go to bytes 12..15 of the line */ |
|---|
| | 640 | dstperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 641 | 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13); |
|---|
| | 642 | break; |
|---|
| | 643 | default: /* NOTE(review): offsets that are not a multiple of 4 get the offset-0 pattern, which would write to bytes 0..3 of the line; confirm callers only pass 4-byte-aligned dst */ |
|---|
| | 644 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 645 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 646 | break; |
|---|
| | 647 | } |
|---|
| | 648 | /* NOTE(review): dstperm and vdst_mask are derived from the first row's address only and reused after dst += stride; this presumably relies on stride being a multiple of 16 (confirm). */ |
|---|
| | 649 | vdst_mask = vec_lvsl(0, dst); |
|---|
| | 650 | |
|---|
| | 651 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va0,dstperm); |
|---|
| | 652 | dst += stride; |
|---|
| | 653 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va1,dstperm); |
|---|
| | 654 | dst += stride; |
|---|
| | 655 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va2,dstperm); |
|---|
| | 656 | dst += stride; |
|---|
| | 657 | VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,vdst_mask,va3,dstperm); |
|---|
| | 658 | } |
|---|
| | 659 | else{ /* dst is 16-byte aligned: new pixels go to bytes 0..3 of the line */ |
|---|
| | 660 | dstperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, |
|---|
| | 661 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); |
|---|
| | 662 | |
|---|
| | 663 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va0,dstperm); |
|---|
| | 664 | dst += stride; |
|---|
| | 665 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va1,dstperm); |
|---|
| | 666 | dst += stride; |
|---|
| | 667 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va2,dstperm); |
|---|
| | 668 | dst += stride; |
|---|
| | 669 | VEC_LOAD_U8_ADD_S16_STORE_U8(dst,va3,dstperm); |
|---|
| | 670 | } |
|---|
| | 671 | } |
|---|
| | 672 | |
|---|