Initial community commit
This commit is contained in:
1002
Src/libvpShared/corelibs/cdxv/vputil/win32/fdct_m.asm
Normal file
1002
Src/libvpShared/corelibs/cdxv/vputil/win32/fdct_m.asm
Normal file
File diff suppressed because it is too large
Load Diff
1398
Src/libvpShared/corelibs/cdxv/vputil/win32/fdctmmx.c
Normal file
1398
Src/libvpShared/corelibs/cdxv/vputil/win32/fdctmmx.c
Normal file
File diff suppressed because it is too large
Load Diff
810
Src/libvpShared/corelibs/cdxv/vputil/win32/fdctwmt.c
Normal file
810
Src/libvpShared/corelibs/cdxv/vputil/win32/fdctwmt.c
Normal file
@@ -0,0 +1,810 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : Fdctwmt.c
|
||||
*
|
||||
* Description : Forward DCT optimized specifically for Intel P4
|
||||
* processor
|
||||
*
|
||||
* AUTHOR : YaoWu Xu
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
* 1.00 YWX 03/11/02 Configuration baseline
|
||||
*
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
* Module Constants
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/* 16-byte aligned scratch vector for fdct_WMT: each pass stores (is07 - is34) here */
/* with movdqa and reloads it when computing op[2] and op[6].                        */
__declspec(align(16)) static unsigned short TIRY[8];
|
||||
|
||||
/* Cosine multipliers addressed as C(i) = [edx + 16 * i] inside fdct_WMT.            */
/* Row i holds cos(i * pi / 16) scaled by 65536, replicated in all 8 word lanes so   */
/* one pmulhw scales a whole column vector. Row 0 is not referenced by fdct_WMT.     */
/* pmulhw is a signed multiply, so rows 1..5 (values above 32767) behave as          */
/* (c - 65536); fdct_WMT compensates by adding the operand back after the multiply.  */
__declspec(align(16)) static unsigned short WmtIdctConst[8 * 8] =
|
||||
{
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
64277,64277,64277,64277,64277,64277,64277,64277,
|
||||
60547,60547,60547,60547,60547,60547,60547,60547,
|
||||
54491,54491,54491,54491,54491,54491,54491,54491,
|
||||
46341,46341,46341,46341,46341,46341,46341,46341,
|
||||
36410,36410,36410,36410,36410,36410,36410,36410,
|
||||
25080,25080,25080,25080,25080,25080,25080,25080,
|
||||
12785,12785,12785,12785,12785,12785,12785,12785
|
||||
};
|
||||
|
||||
|
||||
/**************************************************************************************
|
||||
*
|
||||
* Routine: fdct_WMT
|
||||
*
|
||||
* Description: Forward 8x8 DCT done in two passes; each pass is an 8x8 transpose followed by a 1-D forward DCT on 8 columns.
|
||||
*
|
||||
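* Input: InputData - 8x8 block of shorts, accessed with movdqa (must be 16-byte aligned); it is overwritten with its doubled transpose.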
|
||||
*
|
||||
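* Output: OutputData - 8x8 block of shorts that receives the DCT coefficients; accessed with movdqa (must be 16-byte aligned).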
|
||||
*
|
||||
* Return: None
|
||||
*
|
||||
* Special Note: Uses the file-static TIRY buffer as scratch storage, so the routine is not reentrant.
|
||||
*
|
||||
* Error: None
|
||||
*
|
||||
***************************************************************************************
|
||||
*/
|
||||
void fdct_WMT(short *InputData, short *OutputData)
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
mov eax, InputData
|
||||
mov ebx, OutputData
|
||||
lea edx, WmtIdctConst
|
||||
|
||||
#define I(i) [eax + 16 * i ]
|
||||
#define O(i) [ebx + 16 * i ]
|
||||
#define C(i) [edx + 16 * i ]
|
||||
|
||||
/******************************************************/
|
||||
/* Do 8x8 Transpose */
|
||||
/******************************************************/
|
||||
|
||||
movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
|
||||
movdqa xmm0, I(5) /* xmm0=f7f6f5f4f3f2f1f0 */
|
||||
|
||||
psllw xmm4, 1
|
||||
psllw xmm0, 1
|
||||
|
||||
movdqa xmm5, xmm4 /* make a copy */
|
||||
punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
|
||||
|
||||
punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
|
||||
movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
|
||||
|
||||
movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
|
||||
|
||||
psllw xmm6, 1
|
||||
psllw xmm0, 1
|
||||
|
||||
movdqa xmm7, xmm6 /* make a copy */
|
||||
|
||||
punpcklwd xmm6, xmm0 /* xmm6=h3g3h2g2h1g1h0g0 */
|
||||
punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
|
||||
|
||||
movdqa xmm3, xmm4 /* make a copy */
|
||||
punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
|
||||
|
||||
punpckhdq xmm3, xmm6 /* xmm3=h3g3f3e3h2g2f2e2 */
|
||||
movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
|
||||
/* Free xmm6 */
|
||||
movdqa xmm6, xmm5 /* make a copy */
|
||||
punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
|
||||
|
||||
punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
|
||||
movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
|
||||
/* Free xmm7 */
|
||||
movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
|
||||
|
||||
psllw xmm0, 1
|
||||
psllw xmm1, 1
|
||||
|
||||
movdqa xmm7, xmm0 /* make a copy */
|
||||
|
||||
punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
|
||||
punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
|
||||
/* Free xmm1 */
|
||||
movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
|
||||
movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
|
||||
|
||||
psllw xmm2, 1
|
||||
psllw xmm3, 1
|
||||
|
||||
movdqa xmm1, xmm2 /* make a copy */
|
||||
punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
|
||||
|
||||
punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
|
||||
movdqa xmm3, xmm0 /* make a copy */
|
||||
|
||||
punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
|
||||
punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
|
||||
/* Free xmm2 */
|
||||
movdqa xmm2, xmm7 /* make a copy */
|
||||
punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
|
||||
|
||||
punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
|
||||
movdqa xmm1, xmm0 /* make a copy */
|
||||
|
||||
punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
|
||||
punpckhqdq xmm1, xmm4 /* xmm1=h1g1f1e1d1c1b1a1 */
|
||||
|
||||
movdqa I(0), xmm0 /* save I(0) */
|
||||
movdqa I(1), xmm1 /* save I(1) */
|
||||
|
||||
movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
|
||||
movdqa xmm1, xmm3 /* make a copy */
|
||||
|
||||
punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
|
||||
punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
|
||||
|
||||
movdqa xmm4, xmm2 /* make a copy */
|
||||
punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
|
||||
|
||||
punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
|
||||
movdqa I(2), xmm1 /* save I(2) */
|
||||
|
||||
movdqa I(3), xmm3 /* save I(3) */
|
||||
movdqa I(4), xmm4 /* save I(4) */
|
||||
|
||||
movdqa I(5), xmm2 /* save I(5) */
|
||||
movdqa xmm5, xmm7 /* make a copy */
|
||||
|
||||
punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
|
||||
punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
|
||||
|
||||
movdqa I(6), xmm5 /* save I(6) */
|
||||
movdqa I(7), xmm7 /* save I(7) */
|
||||
|
||||
/******************************************************/
|
||||
/* Done with transpose - Let's do the forward DCT */
|
||||
/******************************************************/
|
||||
|
||||
movdqa xmm0, I(0) /* xmm0 = ip0 */
|
||||
movdqa xmm1, I(1) /* xmm1 = ip1 */
|
||||
|
||||
movdqa xmm2, I(3) /* xmm2 = ip3 */
|
||||
movdqa xmm3, I(5) /* xmm3 = ip5 */
|
||||
|
||||
movdqa xmm4, xmm0 /* xmm4 = ip0 */
|
||||
movdqa xmm5, xmm1 /* xmm5 = ip1 */
|
||||
|
||||
movdqa xmm6, xmm2 /* xmm6 = ip3 */
|
||||
movdqa xmm7, xmm3 /* xmm7 = ip5 */
|
||||
|
||||
paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
|
||||
paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
|
||||
|
||||
paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
|
||||
paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
|
||||
|
||||
psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
|
||||
psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
|
||||
|
||||
psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
|
||||
paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
|
||||
|
||||
psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
|
||||
paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
|
||||
|
||||
psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
|
||||
movdqa TIRY, xmm0 /* save is07-is34 */
|
||||
|
||||
paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
|
||||
paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
|
||||
|
||||
psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
|
||||
psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
|
||||
|
||||
paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
|
||||
paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
|
||||
/*---------------------------------------------------------*/
|
||||
/* op0 and op4
|
||||
/*---------------------------------------------------------*/
|
||||
psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
|
||||
paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
|
||||
|
||||
movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
|
||||
paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
|
||||
|
||||
pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
|
||||
paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
|
||||
|
||||
movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
|
||||
movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
|
||||
|
||||
movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
|
||||
pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
|
||||
|
||||
paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
|
||||
movdqa O(0), xmm3 /* save op0 */
|
||||
/*---------------------------------------------------------*/
|
||||
/* op2 and op6
|
||||
/*---------------------------------------------------------*/
|
||||
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
|
||||
pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
|
||||
|
||||
movdqa xmm2, TIRY /* xmm2 = irot_input_y */
|
||||
movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
|
||||
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
|
||||
|
||||
|
||||
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
|
||||
pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm0, xmm2 /* Truncated */
|
||||
|
||||
paddsw xmm3, xmm0 /* op[2] */
|
||||
movdqa O(2), xmm3 /* save op[2] */
|
||||
|
||||
|
||||
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
|
||||
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
|
||||
|
||||
pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
|
||||
psrlw xmm2, 15
|
||||
|
||||
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
|
||||
paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
|
||||
|
||||
paddw xmm5, xmm2 /* Truncated */
|
||||
movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
|
||||
|
||||
pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
|
||||
psrlw xmm2, 15
|
||||
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
psubsw xmm3, xmm5 /* xmm3 = op[6] */
|
||||
|
||||
movdqa O(6), xmm3
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* icommon_product1, icommon_product2 */
|
||||
/*-----------------------------------------------------------------------*/
|
||||
movdqa xmm0, C(4) /* xmm0 = xC4s4 */
|
||||
movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
|
||||
|
||||
movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
|
||||
pmulhw xmm1, xmm0 /* xmm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
|
||||
|
||||
paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
|
||||
movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
|
||||
|
||||
movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
|
||||
pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
|
||||
|
||||
psrlw xmm2, 15 /* For trucation */
|
||||
paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
|
||||
|
||||
paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
|
||||
/*---------------------------------------------------------*/
|
||||
pxor xmm0, xmm0 /* Clear xmm0 */
|
||||
psubsw xmm0, xmm6 /* xmm0 = - id34 */
|
||||
|
||||
psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
|
||||
paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
|
||||
|
||||
paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
|
||||
psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
|
||||
|
||||
paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
|
||||
paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
|
||||
|
||||
/*---------------------------------------------------------*/
|
||||
/* op1 and op7
|
||||
/*---------------------------------------------------------*/
|
||||
|
||||
movdqa xmm7, C(1) /* xC1S7 */
|
||||
movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
|
||||
|
||||
movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
|
||||
pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
|
||||
|
||||
movdqa xmm7, C(7) /* xC7S1 */
|
||||
psrlw xmm2, 15 /* for trucation */
|
||||
|
||||
paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
|
||||
paddw xmm1, xmm2 /* Trucated */
|
||||
|
||||
pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
|
||||
movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
|
||||
movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
|
||||
|
||||
movdqa xmm7, C(1) /* xC1S7 */
|
||||
pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
|
||||
|
||||
movdqa xmm7, C(7) /* xC7S1 */
|
||||
psrlw xmm2, 15 /* for trucation */
|
||||
|
||||
paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
|
||||
paddw xmm0, xmm2 /* Truncated */
|
||||
|
||||
pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
|
||||
paddw xmm5, xmm2 /* Truncated */
|
||||
|
||||
psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
|
||||
paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = op[7] */
|
||||
|
||||
movdqa O(1), xmm1
|
||||
movdqa O(7), xmm3
|
||||
/*---------------------------------------------------------*/
|
||||
/* op3 and op5
|
||||
/*---------------------------------------------------------*/
|
||||
movdqa xmm0, C(3) /* xC3S5 */
|
||||
movdqa xmm1, C(5) /* xC5S3 */
|
||||
|
||||
movdqa xmm5,xmm6 /* irot_input_x */
|
||||
movdqa xmm7,xmm6 /* irot_input_x */
|
||||
|
||||
movdqa xmm2,xmm4 /* irot_input_y */
|
||||
movdqa xmm3,xmm4 /* irot_input_y */
|
||||
|
||||
pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
|
||||
pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
|
||||
|
||||
psrlw xmm2,15 /* for trucation */
|
||||
psrlw xmm5,15 /* for trucation */
|
||||
|
||||
paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
|
||||
paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
|
||||
|
||||
paddw xmm4,xmm2 /* Truncated */
|
||||
paddw xmm6,xmm5 /* Truncated */
|
||||
|
||||
psubsw xmm4,xmm6 /* op [3] */
|
||||
movdqa O(3),xmm4 /* Save Op[3] */
|
||||
|
||||
movdqa xmm4,xmm3 /* irot_input_y */
|
||||
movdqa xmm6,xmm7 /* irot_input_x */
|
||||
|
||||
pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
|
||||
pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
|
||||
|
||||
paddw xmm4,xmm2 /* Trucated */
|
||||
paddw xmm6,xmm5 /* Trucated */
|
||||
|
||||
paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
|
||||
paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
|
||||
|
||||
paddw xmm3,xmm7 /* Op[5] */
|
||||
movdqa O(5),xmm3 /* Save Op[5] */
|
||||
/*---------------------------------------------------------*/
|
||||
/* End of 8 1-D FDCT */
|
||||
/*---------------------------------------------------------*/
|
||||
#undef I
|
||||
#undef O
|
||||
#define I(i) [ebx + 16 * i ]
|
||||
#define O(i) [ebx + 16 * i ]
|
||||
|
||||
/******************************************************/
|
||||
/* Do 8x8 Transpose */
|
||||
/******************************************************/
|
||||
|
||||
movdqa xmm4, I(4) /* xmm4=e7e6e5e4e3e2e1e0 */
|
||||
movdqa xmm0, I(5) /* xmm0=f7f6f5f4f3f2f1f0 */
|
||||
|
||||
movdqa xmm5, xmm4 /* make a copy */
|
||||
punpcklwd xmm4, xmm0 /* xmm4=f3e3f2e2f1e1f0e0 */
|
||||
|
||||
punpckhwd xmm5, xmm0 /* xmm5=f7e7f6e6f5e5f4e4 */
|
||||
movdqa xmm6, I(6) /* xmm6=g7g6g5g4g3g2g1g0 */
|
||||
|
||||
movdqa xmm0, I(7) /* xmm0=h7h6h5h4h3h2h1h0 */
|
||||
movdqa xmm7, xmm6 /* make a copy */
|
||||
|
||||
punpcklwd xmm6, xmm0 /* xmm6=h3g3h2g2h1g1h0g0 */
|
||||
punpckhwd xmm7, xmm0 /* xmm7=h7g7h6g6h5g5h4g4 */
|
||||
|
||||
movdqa xmm3, xmm4 /* make a copy */
|
||||
punpckldq xmm4, xmm6 /* xmm4=h1g1f1e1h0g0f0e0 */
|
||||
|
||||
punpckhdq xmm3, xmm6 /* xmm3=h3g3f3e3h2g2f2e2 */
|
||||
movdqa I(6), xmm3 /* save h3g3g3e3h2g2f2e2 */
|
||||
/* Free xmm6 */
|
||||
movdqa xmm6, xmm5 /* make a copy */
|
||||
punpckldq xmm5, xmm7 /* xmm5=h5g5f5e5h4g4f4e4 */
|
||||
|
||||
punpckhdq xmm6, xmm7 /* xmm6=h7g7f7e7h6g6f6e6 */
|
||||
movdqa xmm0, I(0) /* xmm0=a7a6a5a4a3a2a1a0 */
|
||||
/* Free xmm7 */
|
||||
movdqa xmm1, I(1) /* xmm1=b7b6b5b4b3b2b1b0 */
|
||||
movdqa xmm7, xmm0 /* make a copy */
|
||||
|
||||
punpcklwd xmm0, xmm1 /* xmm0=b3a3b2a2b1a1b0a0 */
|
||||
punpckhwd xmm7, xmm1 /* xmm7=b7a7b6a6b5a5b4a4 */
|
||||
/* Free xmm1 */
|
||||
movdqa xmm2, I(2) /* xmm2=c7c6c5c4c3c2c1c0 */
|
||||
movdqa xmm3, I(3) /* xmm3=d7d6d5d4d3d2d1d0 */
|
||||
|
||||
movdqa xmm1, xmm2 /* make a copy */
|
||||
punpcklwd xmm2, xmm3 /* xmm2=d3c3d2c2d1c1d0c0 */
|
||||
|
||||
punpckhwd xmm1, xmm3 /* xmm1=d7c7d6c6d5c5d4c4 */
|
||||
movdqa xmm3, xmm0 /* make a copy */
|
||||
|
||||
punpckldq xmm0, xmm2 /* xmm0=d1c1b1a1d0c0b0a0 */
|
||||
punpckhdq xmm3, xmm2 /* xmm3=d3c3b3a3d2c2b2a2 */
|
||||
/* Free xmm2 */
|
||||
movdqa xmm2, xmm7 /* make a copy */
|
||||
punpckldq xmm2, xmm1 /* xmm2=d5c5b5a5d4c4b4a4 */
|
||||
|
||||
punpckhdq xmm7, xmm1 /* xmm7=d7c7b7a7d6c6b6a6 */
|
||||
movdqa xmm1, xmm0 /* make a copy */
|
||||
|
||||
punpcklqdq xmm0, xmm4 /* xmm0=h0g0f0e0d0c0b0a0 */
|
||||
punpckhqdq xmm1, xmm4 /* xmm1=h1g1f1e1d1c1b1a1 */
|
||||
|
||||
movdqa I(0), xmm0 /* save I(0) */
|
||||
movdqa I(1), xmm1 /* save I(1) */
|
||||
|
||||
movdqa xmm0, I(6) /* load h3g3g3e3h2g2f2e2 */
|
||||
movdqa xmm1, xmm3 /* make a copy */
|
||||
|
||||
punpcklqdq xmm1, xmm0 /* xmm1=h2g2f2e2d2c2b2a2 */
|
||||
punpckhqdq xmm3, xmm0 /* xmm3=h3g3f3e3d3c3b3a3 */
|
||||
|
||||
movdqa xmm4, xmm2 /* make a copy */
|
||||
punpcklqdq xmm4, xmm5 /* xmm4=h4g4f4e4d4c4b4a4 */
|
||||
|
||||
punpckhqdq xmm2, xmm5 /* xmm2=h5g5f5e5d5c5b5a5 */
|
||||
movdqa I(2), xmm1 /* save I(2) */
|
||||
|
||||
movdqa I(3), xmm3 /* save I(3) */
|
||||
movdqa I(4), xmm4 /* save I(4) */
|
||||
|
||||
movdqa I(5), xmm2 /* save I(5) */
|
||||
movdqa xmm5, xmm7 /* make a copy */
|
||||
|
||||
punpcklqdq xmm5, xmm6 /* xmm5=h6g6f6e6d6c6b6a6 */
|
||||
punpckhqdq xmm7, xmm6 /* xmm7=h7g7f7e7d7c7b7a7 */
|
||||
|
||||
movdqa I(6), xmm5 /* save I(6) */
|
||||
movdqa I(7), xmm7 /* save I(7) */
|
||||
|
||||
/******************************************************/
|
||||
/* Done with transpose - Let's do the forward DCT */
|
||||
/******************************************************/
|
||||
|
||||
movdqa xmm0, I(0) /* xmm0 = ip0 */
|
||||
movdqa xmm1, I(1) /* xmm1 = ip1 */
|
||||
|
||||
movdqa xmm2, I(3) /* xmm2 = ip3 */
|
||||
movdqa xmm3, I(5) /* xmm3 = ip5 */
|
||||
|
||||
movdqa xmm4, xmm0 /* xmm4 = ip0 */
|
||||
movdqa xmm5, xmm1 /* xmm5 = ip1 */
|
||||
|
||||
movdqa xmm6, xmm2 /* xmm6 = ip3 */
|
||||
movdqa xmm7, xmm3 /* xmm7 = ip5 */
|
||||
|
||||
paddsw xmm0, I(7) /* xmm0 = ip0 + ip7 */
|
||||
paddsw xmm1, I(2) /* xmm1 = ip1 + ip2 */
|
||||
|
||||
paddsw xmm2, I(4) /* xmm2 = ip3 + ip4 */
|
||||
paddsw xmm3, I(6) /* xmm3 = ip5 + ip6 */
|
||||
|
||||
psubsw xmm4, I(7) /* xmm4 = ip0 - ip7 */
|
||||
psubsw xmm5, I(2) /* xmm5 = ip1 - ip2 */
|
||||
|
||||
psubsw xmm0, xmm2 /* xmm0 = is07 - is34 */
|
||||
paddsw xmm2, xmm2 /* xmm2 = is34 * 2 */
|
||||
|
||||
psubsw xmm6, I(4) /* xmm6 = ip3 - ip4 */
|
||||
paddsw xmm2, xmm0 /* xmm2 = is07 + is34 */
|
||||
|
||||
psubsw xmm1, xmm3 /* xmm1 = is12 - is56 */
|
||||
movdqa TIRY, xmm0 /* save is07-is34 */
|
||||
|
||||
paddsw xmm3, xmm3 /* xmm3 = is56 * 2 */
|
||||
paddsw xmm3, xmm1 /* xmm3 = is12 + is56 */
|
||||
|
||||
psubsw xmm7, I(6) /* xmm7 = ip5 -ip6 */
|
||||
psubsw xmm5, xmm7 /* xmm5 = id12 - id56 */
|
||||
|
||||
paddsw xmm7, xmm7 /* xmm7 = id56 * 2 */
|
||||
paddsw xmm7, xmm5 /* xmm7 = id12 + id56 */
|
||||
/*---------------------------------------------------------*/
|
||||
/* op0 and op4
|
||||
/*---------------------------------------------------------*/
|
||||
#if 0
|
||||
movdqa xmm0, xmm2 /* xmm0 =xmm2= is0734 */
|
||||
pmulhw xmm2, C(4) /* xC4S4 * is0734 - is0734 */
|
||||
|
||||
paddw xmm2, xmm0 /* XC4S4 * is0734 */
|
||||
movdqa xmm0, xmm3 /* xmm0 =xmm3= is1256 */
|
||||
|
||||
pmulhw xmm3, C(4) /* xC4S4 * is1256 - is1256 */
|
||||
paddw xmm3, xmm0 /* xC4S4 * is1256 */
|
||||
|
||||
|
||||
movdqa xmm0, xmm2
|
||||
paddsw xmm2, xmm3 /* xC4S4 * ( is0734 +is1256 ) */
|
||||
|
||||
psubsw xmm0, xmm3 /* xC4S4 * ( is0734 -is1256 ) */
|
||||
movdqa xmm3, xmm2
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddsw xmm3, xmm2
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa O(0), xmm3
|
||||
|
||||
psrlw xmm0, 15
|
||||
paddsw xmm2, xmm0
|
||||
|
||||
movdqa O(4), xmm2
|
||||
|
||||
|
||||
#else
|
||||
|
||||
|
||||
psubsw xmm2, xmm3 /* xmm2 = is0734 - is1256 */
|
||||
paddsw xmm3, xmm3 /* xmm3 = is1256 * 2 */
|
||||
|
||||
movdqa xmm0, xmm2 /* xmm0 = is0734 - is1256 */
|
||||
paddsw xmm3, xmm2 /* xmm3 = is0734 + is1256 */
|
||||
|
||||
pmulhw xmm0, C(4) /* xmm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
|
||||
paddw xmm0, xmm2 /* xmm0 = xC4S4 * ( is0734 - is1256 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm0, xmm2 /* Truncate xmm0, now it is op[4] */
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psrlw xmm0, 15
|
||||
|
||||
paddw xmm0, xmm2
|
||||
psraw xmm0, 1
|
||||
|
||||
movdqa O(4), xmm0 /* op4, now xmm0,xmm2 are free */
|
||||
movdqa xmm2, xmm3 /* xmm2 = is0734 + is1256 */
|
||||
|
||||
|
||||
movdqa xmm0, xmm3 /* xmm0 = is0734 + is1256 */
|
||||
pmulhw xmm3, C(4) /* xmm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm3, xmm0 /* xmm3 = xC4S4 * ( is0734 +is1256 ) */
|
||||
|
||||
paddw xmm3, xmm2 /* Truncate xmm3, now it is op[0] */
|
||||
movdqa xmm2, xmm3
|
||||
|
||||
psrlw xmm3, 15
|
||||
paddw xmm3, xmm2
|
||||
|
||||
psraw xmm3, 1
|
||||
movdqa O(0), xmm3 /* save op0 */
|
||||
#endif
|
||||
/*---------------------------------------------------------*/
|
||||
/* op2 and op6
|
||||
/*---------------------------------------------------------*/
|
||||
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
|
||||
pmulhw xmm3, C(2) /* xmm3 = xC2S6 * irot_input_y - irot_input_y */
|
||||
|
||||
movdqa xmm2, TIRY /* xmm2 = irot_input_y */
|
||||
movdqa xmm0, xmm2 /* xmm0 = irot_input_y */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm3, xmm0 /* xmm3 = xC2S6 * irot_input_y */
|
||||
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
|
||||
|
||||
|
||||
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
|
||||
pmulhw xmm0, C(6) /* xmm0 = xC6S2 * irot_input_x */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm0, xmm2 /* Truncated */
|
||||
|
||||
paddsw xmm3, xmm0 /* op[2] */
|
||||
movdqa xmm0, xmm3
|
||||
|
||||
psrlw xmm3, 15
|
||||
paddw xmm3, xmm0
|
||||
|
||||
psraw xmm3, 1
|
||||
movdqa O(2), xmm3 /* save op[2] */
|
||||
|
||||
|
||||
movdqa xmm0, xmm5 /* xmm0 = id12 - id56 */
|
||||
movdqa xmm2, xmm5 /* xmm2 = id12 - id56 */
|
||||
|
||||
pmulhw xmm5, C(2) /* xmm5 = xC2S6 * irot_input_x - irot_input_x */
|
||||
psrlw xmm2, 15
|
||||
|
||||
movdqa xmm3, TIRY /* xmm3 = irot_input_y */
|
||||
paddw xmm5, xmm0 /* xmm5 = xC2S6 * irot_input_x */
|
||||
|
||||
paddw xmm5, xmm2 /* Truncated */
|
||||
movdqa xmm2, xmm3 /* xmm2 = irot_input_y */
|
||||
|
||||
pmulhw xmm3, C(6) /* mm3 = xC6S2 * irot_input_y */
|
||||
psrlw xmm2, 15
|
||||
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
psubsw xmm3, xmm5 /* xmm3 = op[6] */
|
||||
|
||||
movdqa xmm5, xmm3
|
||||
psrlw xmm3, 15
|
||||
|
||||
paddw xmm3, xmm5
|
||||
psraw xmm3, 1
|
||||
|
||||
movdqa O(6), xmm3
|
||||
/*-----------------------------------------------------------------------*/
|
||||
/* icommon_product1, icommon_product2 */
|
||||
/*-----------------------------------------------------------------------*/
|
||||
movdqa xmm0, C(4) /* xmm0 = xC4s4 */
|
||||
movdqa xmm2, xmm1 /* xmm2 = is12 - is56 */
|
||||
|
||||
movdqa xmm3, xmm1 /* xmm3 = is12 - is56 */
|
||||
pmulhw xmm1, xmm0 /* xmm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
|
||||
|
||||
psrlw xmm2, 15
|
||||
paddw xmm1, xmm3 /* xmm1 = xC4S4 * ( is12 - is56 ) */
|
||||
|
||||
paddw xmm1, xmm2 /* Truncate xmm1, now it is icommon_product1 */
|
||||
movdqa xmm2, xmm7 /* xmm2 = id12 + id56 */
|
||||
|
||||
movdqa xmm3, xmm7 /* xmm3 = id12 + id56 */
|
||||
pmulhw xmm7, xmm0 /* xmm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
|
||||
|
||||
psrlw xmm2, 15 /* For trucation */
|
||||
paddw xmm7, xmm3 /* xmm7 = xC4S4 * ( id12 + id56 ) */
|
||||
|
||||
paddw xmm7, xmm2 /* Truncate xmm7, now it is icommon_product2 */
|
||||
/*---------------------------------------------------------*/
|
||||
pxor xmm0, xmm0 /* Clear xmm0 */
|
||||
psubsw xmm0, xmm6 /* xmm0 = - id34 */
|
||||
|
||||
psubsw xmm0, xmm7 /* xmm0 = - ( id34 + idcommon_product2 ) = irot_input_y for 17*/
|
||||
paddsw xmm6, xmm6 /* xmm6 = id34 * 2 */
|
||||
|
||||
paddsw xmm6, xmm0 /* xmm6 = id34 - icommon_product2 = irot_input_x for 35 */
|
||||
psubsw xmm4, xmm1 /* xmm4 = id07 - icommon_product1 = irot_input_x for 35*/
|
||||
|
||||
paddsw xmm1, xmm1 /* xmm1 = icommon_product1 * 2 */
|
||||
paddsw xmm1, xmm4 /* xmm1 = id07 + icommon_product1 = irot_input_x for 17*/
|
||||
|
||||
/*---------------------------------------------------------*/
|
||||
/* op1 and op7
|
||||
/*---------------------------------------------------------*/
|
||||
|
||||
movdqa xmm7, C(1) /* xC1S7 */
|
||||
movdqa xmm2, xmm1 /* xmm2 = irot_input_x */
|
||||
|
||||
movdqa xmm3, xmm1; /* xmm3 = irot_input_x */
|
||||
pmulhw xmm1, xmm7 /* xmm1 = xC1S7 * irot_input_x - irot_input_x */
|
||||
|
||||
movdqa xmm7, C(7) /* xC7S1 */
|
||||
psrlw xmm2, 15 /* for trucation */
|
||||
|
||||
paddw xmm1, xmm3 /* xmm1 = xC1S7 * irot_input_x */
|
||||
paddw xmm1, xmm2 /* Trucated */
|
||||
|
||||
pmulhw xmm3, xmm7 /* xmm3 = xC7S1 * irot_input_x */
|
||||
paddw xmm3, xmm2 /* Truncated */
|
||||
|
||||
movdqa xmm5, xmm0 /* xmm5 = irot_input_y */
|
||||
movdqa xmm2, xmm0 /* xmm2 = irot_input_y */
|
||||
|
||||
movdqa xmm7, C(1) /* xC1S7 */
|
||||
pmulhw xmm0, xmm7 /* xmm0 = xC1S7 * irot_input_y - irot_input_y */
|
||||
|
||||
movdqa xmm7, C(7) /* xC7S1 */
|
||||
psrlw xmm2, 15 /* for trucation */
|
||||
|
||||
paddw xmm0, xmm5 /* xmm0 = xC1S7 * irot_input_y */
|
||||
paddw xmm0, xmm2 /* Truncated */
|
||||
|
||||
pmulhw xmm5, xmm7 /* xmm5 = xC7S1 * irot_input_y */
|
||||
paddw xmm5, xmm2 /* Truncated */
|
||||
|
||||
psubsw xmm1, xmm5 /* xmm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = op[1] */
|
||||
paddsw xmm3, xmm0 /* xmm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = op[7] */
|
||||
|
||||
movdqa xmm5, xmm1
|
||||
movdqa xmm0, xmm3
|
||||
|
||||
psrlw xmm1, 15
|
||||
psrlw xmm3, 15
|
||||
|
||||
paddw xmm1, xmm5
|
||||
paddw xmm3, xmm0
|
||||
|
||||
psraw xmm1, 1
|
||||
psraw xmm3, 1
|
||||
|
||||
|
||||
movdqa O(1), xmm1
|
||||
movdqa O(7), xmm3
|
||||
/*---------------------------------------------------------*/
|
||||
/* op3 and op5
|
||||
/*---------------------------------------------------------*/
|
||||
movdqa xmm0, C(3) /* xC3S5 */
|
||||
movdqa xmm1, C(5) /* xC5S3 */
|
||||
|
||||
movdqa xmm5,xmm6 /* irot_input_x */
|
||||
movdqa xmm7,xmm6 /* irot_input_x */
|
||||
|
||||
movdqa xmm2,xmm4 /* irot_input_y */
|
||||
movdqa xmm3,xmm4 /* irot_input_y */
|
||||
|
||||
pmulhw xmm4,xmm0 /* xmm4 = xC3S5 * irot_input_x - irot_input_x */
|
||||
pmulhw xmm6,xmm1 /* xmm6 = xC5S3 * irot_input_y - irot_input_y */
|
||||
|
||||
psrlw xmm2,15 /* for trucation */
|
||||
psrlw xmm5,15 /* for trucation */
|
||||
|
||||
paddw xmm4,xmm3 /* xmm4 = xC3S5 * irot_input_x */
|
||||
paddw xmm6,xmm7 /* xmm6 = xC5S3 * irot_input_y */
|
||||
|
||||
paddw xmm4,xmm2 /* Truncated */
|
||||
paddw xmm6,xmm5 /* Truncated */
|
||||
|
||||
psubsw xmm4,xmm6 /* op [3] */
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
psrlw xmm4,15
|
||||
paddw xmm4,xmm6
|
||||
|
||||
psraw xmm4,1
|
||||
movdqa O(3),xmm4 /* Save Op[3] */
|
||||
|
||||
movdqa xmm4,xmm3 /* irot_input_y */
|
||||
movdqa xmm6,xmm7 /* irot_input_x */
|
||||
|
||||
pmulhw xmm3,xmm1 /* mm3 = xC5S3 * irot_input_x - irot_input_x */
|
||||
pmulhw xmm7,xmm0 /* mm7 = xC3S5 * irot_input_y - irot_input_y */
|
||||
|
||||
paddw xmm4,xmm2 /* Trucated */
|
||||
paddw xmm6,xmm5 /* Trucated */
|
||||
|
||||
paddw xmm3,xmm4 /* xmm3 = xC5S3 * irot_input_x */
|
||||
paddw xmm7,xmm6 /* mm7 = xC3S5 * irot_input_y */
|
||||
|
||||
paddw xmm3,xmm7 /* Op[5] */
|
||||
movdqa xmm7,xmm3
|
||||
|
||||
psrlw xmm3,15
|
||||
paddw xmm3,xmm7
|
||||
|
||||
psraw xmm3,1
|
||||
movdqa O(5),xmm3 /* Save Op[5] */
|
||||
/*---------------------------------------------------------*/
|
||||
/* End of 8 1-D FDCT */
|
||||
/*---------------------------------------------------------*/
|
||||
|
||||
}/* end of _asm code section */
|
||||
}
|
||||
|
||||
|
||||
|
||||
1053
Src/libvpShared/corelibs/cdxv/vputil/win32/filtmmx.c
Normal file
1053
Src/libvpShared/corelibs/cdxv/vputil/win32/filtmmx.c
Normal file
File diff suppressed because it is too large
Load Diff
790
Src/libvpShared/corelibs/cdxv/vputil/win32/filtwmt.c
Normal file
790
Src/libvpShared/corelibs/cdxv/vputil/win32/filtwmt.c
Normal file
@@ -0,0 +1,790 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : filtwmt.c
|
||||
*
|
||||
* Description : Codec specific functions
|
||||
*
|
||||
* AUTHOR : Yaowu Xu
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
* 1.02 YWX 03-Nov-00 Changed confusing variable name
|
||||
* 1.01 YWX 02-Nov-00 Added the set of functions
|
||||
* 1.00 YWX 19-Oct-00 configuration baseline
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Header Files
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#define STRICT /* Strict type checking. */
|
||||
#include "codec_common.h"
|
||||
#include <math.h>
|
||||
|
||||
/****************************************************************************
|
||||
* Module constants.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/* Smaller of two values. Each argument may be evaluated twice, so avoid side effects. */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
||||
/* Fixed-point scale of the filter weights; equals 1 << FILTER_SHIFT. */
#define FILTER_WEIGHT 128
|
||||
/* Right shift that removes the FILTER_WEIGHT scaling after the taps are summed. */
#define FILTER_SHIFT 7
|
||||
/* Rounding term (64 in each of 8 word lanes), added to the tap sum before the */
/* arithmetic shift right by FILTER_SHIFT in FilterBlock1d_h_wmt.              */
__declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};
|
||||
|
||||
|
||||
/* Two-weight filter table: row k holds 8 copies of (128 - 16*k) followed by 8 copies */
/* of (16*k), so each pair of weights sums to FILTER_WEIGHT.                          */
/* NOTE(review): presumably indexed by a 1/8-pel fraction - confirm against callers.  */
__declspec(align(16)) INT16 BilinearFilters_wmt[8][16] =
|
||||
{
|
||||
{ 128,128,128,128,128,128,128,128, 0, 0, 0, 0, 0, 0, 0, 0 },
|
||||
{ 112,112,112,112,112,112,112,112, 16, 16, 16, 16, 16, 16, 16, 16 },
|
||||
{ 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
|
||||
{ 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
|
||||
{ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
|
||||
{ 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
|
||||
{ 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
|
||||
{ 16, 16, 16, 16, 16, 16, 16, 16, 112,112,112,112,112,112,112,112 }
|
||||
};
|
||||
|
||||
/* Defined in another translation unit; the innermost dimension is 32 INT16s.        */
/* NOTE(review): presumably 4 taps x 8 lanes, the layout FilterBlock1d_h_wmt reads   */
/* through its Filter argument - confirm against the definition.                     */
extern __declspec(align(16)) INT16 BicubicFilters_mmx[17][8][32];
|
||||
|
||||
_inline
|
||||
void FilterBlock1d_h_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi+ 16] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
mov edi,OutputPtr
|
||||
mov esi,SrcPtr
|
||||
dec esi
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
nextrow:
|
||||
|
||||
// kernel 0 and 3 are potentially negative taps. These negative tap filters
|
||||
// must be done first or we could have problems saturating our high value
|
||||
// tap filters
|
||||
movdqu xmm3, [esi] ; xmm3 = p-1..p14
|
||||
movdqu xmm4, xmm3 ; xmm4 = p-1..p14
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
psrldq xmm4, 3 ; xmm4 = p2..p13
|
||||
movdqa xmm5, xmm4 ; xmm5 = p2..p13
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = p2..p7
|
||||
pmullw xmm5, xmm7 ; xmm5 *= kernel 3 modifiers
|
||||
paddsw xmm3, xmm5 ; xmm3 += xmm5
|
||||
|
||||
movdqu xmm4, [esi+1] ; xmm4 = p0..p13
|
||||
movdqa xmm5, xmm4 ; xmm5 = p0..p13
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
|
||||
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
|
||||
paddsw xmm3, xmm5 ; xmm3 += xmm5
|
||||
|
||||
psrldq xmm4, 1 ; xmm4 = p1..p13
|
||||
movdqa xmm5, xmm4 ; xmm5 = p1..p13
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = p1..p7
|
||||
pmullw xmm5, xmm6 ; xmm5 *= kernel 2 modifiers
|
||||
paddsw xmm3, xmm5 ; xmm3 += xmm5
|
||||
|
||||
paddsw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
packuswb xmm3, xmm0 ; pack and saturate
|
||||
|
||||
movdq2q mm0, xmm3
|
||||
movq [edi],mm0 ; store the results in the destination
|
||||
|
||||
add esi,SrcPixelsPerLine ; next line
|
||||
add edi,eax;
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
}
|
||||
}
|
||||
|
||||
_inline
|
||||
void FilterBlock1d_v_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm6, [edi + 32] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm7, [edi + 48] ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
mov edx, PixelsPerLine
|
||||
mov edi, OutputPtr
|
||||
mov esi, SrcPtr
|
||||
sub esi, PixelsPerLine
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
|
||||
nextrow:
|
||||
movdqu xmm3, [esi] ; xmm3 = p0..p16
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
add esi, edx ; move source forward 1 line to avoid 3 * pitch
|
||||
|
||||
movdqu xmm4, [esi+2*edx] ; xmm4 = p0..p16
|
||||
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
|
||||
pmullw xmm4, xmm7 ; xmm4 *= kernel 3 modifiers.
|
||||
paddsw xmm3, xmm4 ; xmm3 += xmm4
|
||||
|
||||
movdqu xmm4, [esi ] ; xmm4 = p0..p16
|
||||
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
|
||||
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
|
||||
paddsw xmm3, xmm4 ; xmm3 += xmm4
|
||||
|
||||
movdqu xmm4, [esi +edx] ; xmm4 = p0..p16
|
||||
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
|
||||
pmullw xmm4, xmm6 ; xmm4 *= kernel 2 modifiers.
|
||||
paddsw xmm3, xmm4 ; xmm3 += xmm4
|
||||
|
||||
|
||||
|
||||
paddsw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
|
||||
movdq2q mm0, xmm3
|
||||
movq [edi],mm0 ; store the results in the destination
|
||||
|
||||
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
|
||||
// recon block should be in cache this shouldn't cost much. Its obviously
|
||||
// avoidable!!!.
|
||||
add edi,eax;
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
_inline
|
||||
void FilterBlock1d_hb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
mov edi,OutputPtr
|
||||
mov esi,SrcPtr
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
nextrow:
|
||||
movdqu xmm3, [esi] ; xmm3 = p-1..p14
|
||||
movdqu xmm5, xmm3 ; xmm4 = p-1..p14
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
psrldq xmm5, 1 ; xmm4 = p0..p13
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
|
||||
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
|
||||
paddw xmm3, xmm5 ; xmm3 += xmm5
|
||||
|
||||
paddw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
|
||||
movdq2q mm0, xmm3
|
||||
movq [edi],mm0 ; store the results in the destination
|
||||
|
||||
add esi,SrcPixelsPerLine ; next line
|
||||
add edi,eax;
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
}
|
||||
}
|
||||
|
||||
_inline
|
||||
void FilterBlock1d_vb8_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
|
||||
mov edx, PixelsPerLine
|
||||
mov edi, OutputPtr
|
||||
mov esi, SrcPtr
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
|
||||
nextrow:
|
||||
movdqu xmm3, [esi] ; xmm3 = p0..p16
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
|
||||
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
|
||||
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
|
||||
paddw xmm3, xmm4 ; xmm3 += xmm4
|
||||
|
||||
paddw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
|
||||
movdq2q mm0, xmm3
|
||||
movq [edi],mm0 ; store the results in the destination
|
||||
|
||||
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
|
||||
// recon block should be in cache this shouldn't cost much. Its obviously
|
||||
// avoidable!!!.
|
||||
add esi,edx
|
||||
add edi,eax
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : FilterBlock2dBil
|
||||
*
|
||||
* INPUTS : Pointer to source data
|
||||
*
|
||||
* OUTPUTS : Filtered data
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Applies a bilinear filter on the intput data to produce
|
||||
* a predictor block (UINT16)
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
_inline
|
||||
void FilterBlock2dBil_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
mov eax, HFilter ;
|
||||
mov edi, OutputPtr ;
|
||||
mov esi, SrcPtr ;
|
||||
lea ecx, [edi+64] ;
|
||||
mov edx, SrcPixelsPerLine ;
|
||||
|
||||
movdqa xmm1, [eax] ;
|
||||
movdqa xmm2, [eax+16] ;
|
||||
|
||||
mov eax, VFilter ;
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
// get the first horizontal line done ;
|
||||
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
|
||||
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
|
||||
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
|
||||
|
||||
pmullw xmm3, xmm1 ;
|
||||
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
pmullw xmm4, xmm2 ;
|
||||
paddw xmm3, xmm4 ;
|
||||
|
||||
paddw xmm3, rd ;
|
||||
psraw xmm3, FILTER_SHIFT ; ready for output
|
||||
|
||||
movdqa xmm5, xmm3 ;
|
||||
|
||||
add esi, edx ; next line
|
||||
NextRow:
|
||||
pmullw xmm5, [eax] ;
|
||||
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
|
||||
|
||||
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
|
||||
pmullw xmm3, xmm1 ;
|
||||
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
movdqa xmm6, xmm5 ;
|
||||
pmullw xmm4, xmm2 ;
|
||||
|
||||
paddw xmm3, xmm4 ;
|
||||
paddw xmm3, rd ;
|
||||
|
||||
psraw xmm3, FILTER_SHIFT ; ready for output
|
||||
movdqa xmm5, xmm3 ; make a copy for the next row
|
||||
|
||||
pmullw xmm3, [eax+16] ;
|
||||
paddw xmm6, xmm3 ;
|
||||
|
||||
|
||||
paddw xmm6, rd ; xmm6 += round value
|
||||
psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
|
||||
|
||||
packuswb xmm6, xmm0 ; pack and unpack to saturate
|
||||
movdq2q mm0, xmm6
|
||||
|
||||
movq [edi], mm0 ; store the results in the destination
|
||||
add esi, edx ; next line
|
||||
add edi, 8 ;
|
||||
|
||||
cmp edi, ecx ;
|
||||
jne NextRow
|
||||
|
||||
}
|
||||
|
||||
// First filter 1d Horizontal
|
||||
//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
|
||||
// Now filter Verticaly
|
||||
//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
|
||||
|
||||
|
||||
}
|
||||
|
||||
_inline
|
||||
void FilterUnpackBlock2dBil_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
mov eax, HFilter ;
|
||||
mov edi, OutputPtr ;
|
||||
mov esi, SrcPtr ;
|
||||
lea ecx, [edi+128] ;
|
||||
mov edx, SrcPixelsPerLine ;
|
||||
|
||||
movdqa xmm1, [eax] ;
|
||||
movdqa xmm2, [eax+16] ;
|
||||
|
||||
mov eax, VFilter ;
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
// get the first horizontal line done ;
|
||||
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
|
||||
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
|
||||
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
|
||||
|
||||
pmullw xmm3, xmm1 ;
|
||||
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
pmullw xmm4, xmm2 ;
|
||||
paddw xmm3, xmm4 ;
|
||||
|
||||
paddw xmm3, rd ;
|
||||
psraw xmm3, FILTER_SHIFT ; ready for output
|
||||
|
||||
movdqa xmm5, xmm3 ;
|
||||
|
||||
add esi, edx ; next line
|
||||
NextRow:
|
||||
pmullw xmm5, [eax] ;
|
||||
movdqu xmm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
|
||||
|
||||
psrldq xmm4, 1 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 xx
|
||||
pmullw xmm3, xmm1 ;
|
||||
punpcklbw xmm4, xmm0 ; 00 01 02 03 04 05 06 07
|
||||
|
||||
movdqa xmm6, xmm5 ;
|
||||
pmullw xmm4, xmm2 ;
|
||||
|
||||
paddw xmm3, xmm4 ;
|
||||
paddw xmm3, rd ;
|
||||
|
||||
psraw xmm3, FILTER_SHIFT ; ready for output
|
||||
movdqa xmm5, xmm3 ; make a copy for the next row
|
||||
|
||||
pmullw xmm3, [eax+16] ;
|
||||
paddw xmm6, xmm3 ;
|
||||
|
||||
|
||||
paddw xmm6, rd ; xmm6 += round value
|
||||
psraw xmm6, FILTER_SHIFT ; xmm6 /= 128
|
||||
|
||||
movdqu [edi], xmm6;
|
||||
|
||||
/*
|
||||
packuswb xmm6, xmm0 ; pack and unpack to saturate
|
||||
movdq2q mm0, xmm6
|
||||
|
||||
movq [edi], mm0 ; store the results in the destination
|
||||
*/
|
||||
add esi, edx ; next line
|
||||
add edi, 16 ;
|
||||
|
||||
cmp edi, ecx ;
|
||||
jne NextRow
|
||||
|
||||
}
|
||||
|
||||
// First filter 1d Horizontal
|
||||
//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
|
||||
// Now filter Verticaly
|
||||
//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
|
||||
|
||||
|
||||
}
|
||||
_inline
|
||||
void FilterUnpackBlock1d_hb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
mov edi,OutputPtr
|
||||
mov esi,SrcPtr
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
nextrow:
|
||||
movdqu xmm3, [esi] ; xmm3 = p-1..p14
|
||||
movdqu xmm5, xmm3 ; xmm4 = p-1..p14
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p-1..p6
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
psrldq xmm5, 1 ; xmm4 = p0..p13
|
||||
punpcklbw xmm5, xmm0 ; xmm5 = p0..p7
|
||||
pmullw xmm5, xmm2 ; xmm5 *= kernel 1 modifiers
|
||||
paddw xmm3, xmm5 ; xmm3 += xmm5
|
||||
|
||||
paddw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
|
||||
/*
|
||||
packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
movdq2q mm0, xmm3
|
||||
*/
|
||||
|
||||
movdqu [edi],xmm3 ; store the results in the destination
|
||||
|
||||
add esi,SrcPixelsPerLine ; next line
|
||||
add edi,eax;
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
}
|
||||
}
|
||||
|
||||
_inline
|
||||
void FilterUnpackBlock1d_vb8_wmt( UINT8 *SrcPtr, INT16 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
|
||||
{
|
||||
__asm
|
||||
{
|
||||
|
||||
mov edi, Filter
|
||||
movdqa xmm1, [edi] ; xmm3 *= kernel 0 modifiers.
|
||||
movdqa xmm2, [edi + 16] ; xmm3 *= kernel 0 modifiers.
|
||||
mov edx, PixelsPerLine
|
||||
mov edi, OutputPtr
|
||||
mov esi, SrcPtr
|
||||
mov ecx, DWORD PTR OutputHeight
|
||||
mov eax, OutputWidth ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 00000000
|
||||
|
||||
|
||||
nextrow:
|
||||
movdqu xmm3, [esi] ; xmm3 = p0..p16
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p0..p8
|
||||
pmullw xmm3, xmm1 ; xmm3 *= kernel 0 modifiers.
|
||||
|
||||
movdqu xmm4, [esi +edx ] ; xmm4 = p0..p16
|
||||
punpcklbw xmm4, xmm0 ; xmm4 = p0..p8
|
||||
pmullw xmm4, xmm2 ; xmm4 *= kernel 1 modifiers.
|
||||
paddw xmm3, xmm4 ; xmm3 += xmm4
|
||||
|
||||
paddw xmm3, rd ; xmm3 += round value
|
||||
psraw xmm3, FILTER_SHIFT ; xmm3 /= 128
|
||||
|
||||
/*packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
|
||||
movdq2q mm0, xmm3
|
||||
*/
|
||||
movdqu [edi],xmm3 ; store the results in the destination
|
||||
|
||||
// the subsequent iterations repeat 3 out of 4 of these reads. Since the
|
||||
// recon block should be in cache this shouldn't cost much. Its obviously
|
||||
// avoidable!!!.
|
||||
add esi,edx
|
||||
add edi,eax
|
||||
|
||||
dec ecx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : FilterBlockBil_8
|
||||
*
|
||||
* INPUTS : ReconPtr1, ReconPtr12
|
||||
* Two pointers into the block of data to be filtered
|
||||
* These pointers bound the fractional pel position
|
||||
* PixelsPerLine
|
||||
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr12
|
||||
* Modx, ModY
|
||||
* The fractional pel bits used to select a filter.
|
||||
*
|
||||
*
|
||||
* OUTPUTS : ReconRefPtr
|
||||
* A pointer to an 8x8 buffer into which UINT8 filtered data is written.
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Produces a bilinear filtered fractional pel prediction block
|
||||
* with UINT8 output
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
|
||||
{
|
||||
int diff;
|
||||
|
||||
// swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
|
||||
diff=ReconPtr2-ReconPtr1;
|
||||
|
||||
// The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
|
||||
// This works out to be what we want... despite the pointer swapping that goes on below.
|
||||
// For example... if the X component of the vector is a +ve ModX = X%8.
|
||||
// if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
|
||||
|
||||
if(diff<0)
|
||||
{ // swap pointers so ReconPtr1 smaller
|
||||
UINT8 *temp=ReconPtr1;
|
||||
ReconPtr1=ReconPtr2;
|
||||
ReconPtr2=temp;
|
||||
diff= (int)(ReconPtr2-ReconPtr1);
|
||||
}
|
||||
|
||||
if( diff==1 )
|
||||
{
|
||||
FilterBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
|
||||
}
|
||||
else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
|
||||
{
|
||||
FilterBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY]);
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
|
||||
{
|
||||
FilterBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
//FilterBlock2dBil_8_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
|
||||
{
|
||||
FilterBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
//FilterBlock2dBil_8_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
}
|
||||
}
|
||||
|
||||
// Widens an 8x8 block of UINT8 pixels to 16-bit words.
//
// SrcPtr            first source pixel; exactly 8 bytes are read from each of
//                   8 rows
// OutputPtr         destination for 64 words, written contiguously
//                   (16 bytes per row)
// SrcPixelsPerLine  source pitch in bytes
//
// Each row is loaded with an 8-byte movq rather than a 16-byte movdqu, so the
// routine never reads past the eighth pixel of a row. With a tightly sized
// 8x8 source the wider load would run 8 bytes beyond the end of the buffer
// on the last row.
_inline void UnpackBlock_wmt( UINT8 *SrcPtr, UINT16 *OutputPtr, UINT32 SrcPixelsPerLine )
{
    __asm
    {
        mov         edi, OutputPtr
        mov         esi, SrcPtr

        mov         ecx, 8                      ; rows to convert
        mov         eax, 16                     ; destination pitch in bytes
        pxor        xmm0, xmm0                  ; zero, used to widen bytes to words

    unpack_next_row:
        movq        xmm3, qword ptr [esi]       ; p0 .. p7, upper half cleared
        punpcklbw   xmm3, xmm0                  ; p0 .. p7 as words
        movdqu      [edi], xmm3                 ; store 8 words

        add         esi, SrcPixelsPerLine       ; next source row
        add         edi, eax                    ; next destination row

        dec         ecx
        jnz         unpack_next_row
    }
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : FilterBlock2d
|
||||
*
|
||||
* INPUTS : Pointer to source data
|
||||
*
|
||||
* OUTPUTS : Filtered data
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Applies a 2d 4 tap filter on the intput data to produce
|
||||
* a predictor block (UINT16)
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void FilterBlock2d_wmt( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
|
||||
{
|
||||
|
||||
UINT8 Intermediate[256];
|
||||
|
||||
// First filter 1d Horizontal
|
||||
FilterBlock1d_h_wmt(SrcPtr-SrcPixelsPerLine, Intermediate, SrcPixelsPerLine, 1, 11, 8, HFilter );
|
||||
|
||||
// Now filter Verticaly
|
||||
FilterBlock1d_v_wmt(Intermediate+BLOCK_HEIGHT_WIDTH, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : FilterBlock
|
||||
*
|
||||
* INPUTS : ReconPtr1, ReconPtr12
|
||||
* Two pointers into the block of data to be filtered
|
||||
* These pointers bound the fractional pel position
|
||||
* PixelsPerLine
|
||||
* Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr12
|
||||
* Modx, ModY
|
||||
* The fractional pel bits used to select a filter.
|
||||
* UseBicubic
|
||||
* Whether to use the bicubuc filter set or the bilinear set
|
||||
*
|
||||
*
|
||||
* OUTPUTS : ReconRefPtr
|
||||
* A pointer to an 8x8 buffer into which the filtered data is written.
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Produces a filtered fractional pel prediction block
|
||||
* using bilinear or bicubic filters
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
|
||||
{
|
||||
int diff;
|
||||
UINT8 Intermediate[256];
|
||||
|
||||
// swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
|
||||
diff=ReconPtr2-ReconPtr1;
|
||||
|
||||
// The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
|
||||
// This works out to be what we want... despite the pointer swapping that goes on below.
|
||||
// For example... if the X component of the vector is a +ve ModX = X%8.
|
||||
// if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.
|
||||
|
||||
if(diff<0)
|
||||
{ // swap pointers so ReconPtr1 smaller
|
||||
UINT8 *temp=ReconPtr1;
|
||||
ReconPtr1=ReconPtr2;
|
||||
ReconPtr2=temp;
|
||||
diff= (int)(ReconPtr2-ReconPtr1);
|
||||
}
|
||||
|
||||
if(!diff)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if(UseBicubic)
|
||||
{
|
||||
if( diff==1 )
|
||||
{ // Fractional pixel in horizontal only
|
||||
FilterBlock1d_h_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
|
||||
}
|
||||
else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
|
||||
{
|
||||
FilterBlock1d_v_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
|
||||
{
|
||||
FilterBlock2d_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
|
||||
{
|
||||
FilterBlock2d_wmt( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
|
||||
}
|
||||
UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
if( diff==1 )
|
||||
{
|
||||
FilterUnpackBlock1d_hb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, 1, 8, 16, BilinearFilters_wmt[ModX] );
|
||||
|
||||
// Fractional pixel in horizontal only
|
||||
/*
|
||||
FilterBlock1d_hb8_wmt(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BilinearFilters_wmt[ModX] );
|
||||
UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
|
||||
*/
|
||||
|
||||
}
|
||||
else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only
|
||||
{
|
||||
FilterUnpackBlock1d_vb8_wmt(ReconPtr1, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 16, BilinearFilters_wmt[ModY]);
|
||||
/*
|
||||
FilterBlock1d_vb8_wmt(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_wmt[ModY]);
|
||||
UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
|
||||
*/
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right
|
||||
{
|
||||
|
||||
FilterUnpackBlock2dBil_wmt( ReconPtr1-1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
/*
|
||||
FilterBlock2dBil_wmt( ReconPtr1-1, Intermediate, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
|
||||
*/
|
||||
}
|
||||
else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left
|
||||
{
|
||||
FilterUnpackBlock2dBil_wmt( ReconPtr1, ReconRefPtr, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
/*
|
||||
FilterBlock2dBil_wmt( ReconPtr1, Intermediate, PixelsPerLine, BilinearFilters_wmt[ModX], BilinearFilters_wmt[ModY] );
|
||||
UnpackBlock_wmt( Intermediate, ReconRefPtr, 8 );
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
2156
Src/libvpShared/corelibs/cdxv/vputil/win32/mmxidct.c
Normal file
2156
Src/libvpShared/corelibs/cdxv/vputil/win32/mmxidct.c
Normal file
File diff suppressed because it is too large
Load Diff
856
Src/libvpShared/corelibs/cdxv/vputil/win32/mmxrecon.c
Normal file
856
Src/libvpShared/corelibs/cdxv/vputil/win32/mmxrecon.c
Normal file
@@ -0,0 +1,856 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : OptFunctions.c
|
||||
*
|
||||
* Description : MMX or otherwise processor specific
|
||||
* optimised versions of functions
|
||||
*
|
||||
* AUTHOR : Paul Wilkins
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
* 1.07 JBB 26/01/01 Removed unused function
|
||||
* 1.06 YWX 23/05/00 Remove the clamping in MmxReconPostProcess()
|
||||
* 1.05 YWX 15/05/00 Added MmxReconPostProcess()
|
||||
* 1.04 SJL 03/14/00 Added in Tim's versions of MmxReconInter and MmxReconInterHalfPixel2.
|
||||
* 1.03 PGW 12/10/99 Changes to reduce unnecessary dependencies.
|
||||
* 1.02 PGW 30/08/99 Minor changes to MmxReconInterHalfPixel2().
|
||||
* 1.01 PGW 13/07/99 Changes to keep reconstruction data to 16 bit
|
||||
* 1.00 PGW 14/06/99 Configuration baseline
|
||||
*
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/*
|
||||
Use Tim's optimized version.
|
||||
*/
|
||||
#define USING_TIMS 1
|
||||
|
||||
/****************************************************************************
|
||||
* Header Files
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
#define STRICT // Strict type checking.
|
||||
|
||||
#include "codec_common.h"
|
||||
|
||||
#include "reconstruct.h"
|
||||
|
||||
/****************************************************************************
|
||||
* Module constants.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Imports.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
extern INT32 * XX_LUT;
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Global Variables
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Functions
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Module Statics
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
INT16 Ones[4] = {1,1,1,1};
|
||||
INT16 OneTwoEight[4] = {128,128,128,128};
|
||||
UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
|
||||
|
||||
#pragma warning( disable : 4799 ) // Disable no emms instruction warning!
|
||||
/****************************************************************************
|
||||
* Forward References
|
||||
*****************************************************************************
|
||||
*/
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : MMXReconIntra
|
||||
*
|
||||
* INPUTS : INT16 * idct
|
||||
* Pointer to the output from the idct for this block
|
||||
*
|
||||
* UINT32 stride
|
||||
* Line Length in pixels in recon and reference images
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* OUTPUTS : UINT8 * dest
|
||||
* The reconstruction buffer
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs an intra block - MMX version
|
||||
*
|
||||
* SPECIAL NOTES : Tim Murphy's optimized version
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
|
||||
{
|
||||
(void) TmpDataBuffer;
|
||||
__asm
|
||||
{
|
||||
// u pipe
|
||||
// v pipe
|
||||
mov eax,[idct] ; Signed 16 bit inputs
|
||||
mov edx,[dest] ; Signed 8 bit outputs
|
||||
movq mm0,[Eight128s] ; Set mm0 to 0x8080808080808080
|
||||
;
|
||||
mov ebx,[stride] ; Line stride in output buffer
|
||||
lea ecx,[eax+128] ; Endpoint in input buffer
|
||||
loop_label: ;
|
||||
movq mm2,[eax] ; First four input values
|
||||
;
|
||||
packsswb mm2,[eax+8] ; pack with next(high) four values
|
||||
por mm0,mm0 ; stall
|
||||
pxor mm2,mm0 ; Convert result to unsigned (same as add 128)
|
||||
lea eax,[eax + 16] ; Step source buffer
|
||||
cmp eax,ecx ; are we done
|
||||
;
|
||||
movq [edx],mm2 ; store results
|
||||
;
|
||||
lea edx,[edx+ebx] ; Step output buffer
|
||||
jc loop_label ; Loop back if we are not done
|
||||
}
|
||||
// 6c/8 elts = 9c/8 = 1.125 c/pix
|
||||
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : MmxReconInter
|
||||
*
|
||||
* INPUTS : UINT8 * RefPtr
|
||||
* The last frame reference
|
||||
*
|
||||
* INT16 * ChangePtr
|
||||
* Pointer to the change data
|
||||
*
|
||||
* UINT32 LineStep
|
||||
* Line Length in pixels in recon and ref images
|
||||
*
|
||||
* OUTPUTS : UINT8 * ReconPtr
|
||||
* The reconstruction
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs data from last data and change
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
#if USING_TIMS
|
||||
void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
(void) TmpDataBuffer;
|
||||
|
||||
_asm {
|
||||
push edi
|
||||
;; mov ebx, [ref]
|
||||
;; mov ecx, [diff]
|
||||
;; mov eax, [dest]
|
||||
;; mov edx, [stride]
|
||||
mov ebx, [RefPtr]
|
||||
mov ecx, [ChangePtr]
|
||||
mov eax, [ReconPtr]
|
||||
mov edx, [LineStep]
|
||||
pxor mm0, mm0
|
||||
lea edi, [ecx + 128]
|
||||
;
|
||||
L:
|
||||
movq mm2, [ebx] ; (+3 misaligned) 8 reference pixels
|
||||
;
|
||||
movq mm4, [ecx] ; first 4 changes
|
||||
movq mm3, mm2
|
||||
movq mm5, [ecx + 8] ; last 4 changes
|
||||
punpcklbw mm2, mm0 ; turn first 4 refs into positive 16-bit #s
|
||||
paddsw mm2, mm4 ; add in first 4 changes
|
||||
punpckhbw mm3, mm0 ; turn last 4 refs into positive 16-bit #s
|
||||
paddsw mm3, mm5 ; add in last 4 changes
|
||||
add ebx, edx ; next row of reference pixels
|
||||
packuswb mm2, mm3 ; pack result to unsigned 8-bit values
|
||||
lea ecx, [ecx + 16] ; next row of changes
|
||||
cmp ecx, edi ; are we done?
|
||||
;
|
||||
movq [eax], mm2 ; store result
|
||||
;
|
||||
lea eax, [eax+edx] ; next row of output
|
||||
jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
|
||||
|
||||
pop edi
|
||||
}
|
||||
}
|
||||
#else
|
||||
void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
|
||||
// Note that the line step for the change data is assumed to be 8 * 32 bits.
|
||||
__asm
|
||||
{
|
||||
// Set up data pointers
|
||||
mov eax,dword ptr [ReconPtr]
|
||||
mov ebx,dword ptr [RefPtr]
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
mov edx,dword ptr [LineStep]
|
||||
pxor mm6, mm6 ; Blank mmx6
|
||||
|
||||
// Row 1
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 2
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 3
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 4
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 5
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 6
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 7
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
add ebx,edx ; Step the reference pointer.
|
||||
add ecx,16 ; Step the change pointer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
// Row 8
|
||||
// Load the data values. The change data needs to be unpacked to words
|
||||
movq mm0,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data
|
||||
paddsw mm0, mm2 ; First 4 values
|
||||
paddsw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [eax],mm0 ; Write the data out to the results buffer
|
||||
|
||||
//emms ; Clear the MMX state.
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : MmxReconInterHalfPixel2
|
||||
*
|
||||
* INPUTS : UINT8 * RefPtr1, RefPtr2
|
||||
* The last frame reference
|
||||
*
|
||||
* INT16 * ChangePtr
|
||||
* Pointer to the change data
|
||||
*
|
||||
* UINT32 LineStep
|
||||
* Line Length in pixels in recon and ref images
|
||||
*
|
||||
*
|
||||
* OUTPUTS : UINT8 * ReconPtr
|
||||
* The reconstruction
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs data from half pixel reference data and change.
|
||||
* Half pixel data interpolated from 2 references.
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
#if USING_TIMS
|
||||
|
||||
#define A 0
|
||||
|
||||
void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
|
||||
UINT8 * RefPtr1, UINT8 * RefPtr2,
|
||||
INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
# if A
|
||||
static culong FourOnes[2] = { 65537, 65537}; // only read once
|
||||
# endif
|
||||
(void) TmpDataBuffer;
|
||||
|
||||
_asm {
|
||||
push esi
|
||||
push edi
|
||||
|
||||
;; mov ecx, [diff]
|
||||
;; mov esi, [ref1]
|
||||
;; mov edi, [ref2]
|
||||
;; mov ebx, [dest]
|
||||
;; mov edx, [stride]
|
||||
|
||||
mov ecx, [ChangePtr]
|
||||
mov esi, [RefPtr1]
|
||||
mov edi, [RefPtr2]
|
||||
mov ebx, [ReconPtr]
|
||||
mov edx, [LineStep]
|
||||
|
||||
lea eax, [ecx+128]
|
||||
|
||||
# if A
|
||||
movq mm1, [FourOnes]
|
||||
# endif
|
||||
|
||||
pxor mm0, mm0
|
||||
L:
|
||||
movq mm2, [esi] ; (+3 misaligned) mm2 = row from ref1
|
||||
;
|
||||
movq mm4, [edi] ; (+3 misaligned) mm4 = row from ref2
|
||||
movq mm3, mm2
|
||||
punpcklbw mm2, mm0 ; mm2 = start ref1 as positive 16-bit #s
|
||||
movq mm5, mm4
|
||||
movq mm6, [ecx] ; mm6 = first 4 changes
|
||||
punpckhbw mm3, mm0 ; mm3 = end ref1 as positive 16-bit #s
|
||||
movq mm7, [ecx+8] ; mm7 = last 4 changes
|
||||
punpcklbw mm4, mm0 ; mm4 = start ref2 as positive 16-bit #s
|
||||
punpckhbw mm5, mm0 ; mm5 = end ref2 as positive 16-bit #s
|
||||
paddw mm2, mm4 ; mm2 = start (ref1 + ref2)
|
||||
paddw mm3, mm5 ; mm3 = end (ref1 + ref2)
|
||||
|
||||
# if A
|
||||
paddw mm2, mm1 ; rounding adjustment
|
||||
paddw mm3, mm1
|
||||
# endif
|
||||
|
||||
psrlw mm2, 1 ; mm2 = start (ref1 + ref2)/2
|
||||
psrlw mm3, 1 ; mm3 = end (ref1 + ref2)/2
|
||||
paddw mm2, mm6 ; add changes to start
|
||||
paddw mm3, mm7 ; add changes to end
|
||||
lea ecx, [ecx+16] ; next row idct
|
||||
packuswb mm2, mm3 ; pack start|end to unsigned 8-bit
|
||||
add esi, edx ; next row ref1
|
||||
add edi, edx ; next row ref2
|
||||
cmp ecx, eax
|
||||
movq [ebx], mm2 ; store result
|
||||
;
|
||||
lea ebx, [ebx+edx]
|
||||
jc L ; 22c / 8 elts = 33c / 8 pixels = 4.125 c/pix
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
}
|
||||
}
|
||||
|
||||
#undef A
|
||||
|
||||
#else
|
||||
void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
|
||||
UINT8 * RefPtr1, UINT8 * RefPtr2,
|
||||
INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
UINT8 * TmpDataPtr = (UINT8 *)TmpDataBuffer->TmpReconBuffer;
|
||||
|
||||
// Note that the line step for the change data is assumed to be 8 * 32 bits.
|
||||
__asm
|
||||
{
|
||||
pxor mm6, mm6 ; Blank mmx6
|
||||
|
||||
// Set up data pointers
|
||||
mov eax,dword ptr [RefPtr1]
|
||||
mov ebx,dword ptr [RefPtr2]
|
||||
mov edx,dword ptr [LineStep]
|
||||
|
||||
// Row 1
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
|
||||
// Load the data values (Ref1 and Ref2) and unpack to signed 16 bit values
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 2
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,16
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm3, mm2 ; Copy data
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+8],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 3
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,32
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+16],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 4
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,48
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+24],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 5
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,64
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+32],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 6
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,80
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+40],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 7
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,96
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+48],mm0 ; Write the data out to the temporary results buffer
|
||||
add eax,edx ; Step the reference pointers
|
||||
add ebx,edx
|
||||
|
||||
// Row 8
|
||||
// Load the change pointer
|
||||
mov ecx,dword ptr [ChangePtr]
|
||||
add ecx,112
|
||||
|
||||
// Load the data values (Ref1 and Ref2).
|
||||
movq mm0,dword ptr [eax] ; Load 8 elements of source data
|
||||
movq mm2,dword ptr [ebx] ; Load 8 elements of source data
|
||||
movq mm1, mm0 ; Copy data
|
||||
movq mm3, mm2 ; Copy data
|
||||
|
||||
punpcklbw mm0, mm6 ; Low bytes to words
|
||||
punpckhbw mm1, mm6 ; High bytes to words
|
||||
punpcklbw mm2, mm6 ; Low bytes to words
|
||||
punpckhbw mm3, mm6 ; High bytes to words
|
||||
|
||||
// Average Ref1 and Ref2
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm3 ; Second 4 values
|
||||
psrlw mm0, 1
|
||||
psrlw mm1, 1
|
||||
|
||||
// Load 8 elements of 16 bit change data
|
||||
movq mm2,dword ptr [ecx] ; Load 4 elements of change data
|
||||
movq mm4,dword ptr [ecx+8] ; Load next 4 elements of change data
|
||||
|
||||
// Sum the data reference and difference data
|
||||
paddw mm0, mm2 ; First 4 values
|
||||
paddw mm1, mm4 ; Second 4 values
|
||||
|
||||
// Pack and store
|
||||
mov ecx,dword ptr [TmpDataPtr]
|
||||
packuswb mm0, mm1 ; Then pack and saturate to unsigned bytes
|
||||
movq dword ptr [ecx+56],mm0 ; Write the data out to the temporary results buffer
|
||||
|
||||
|
||||
// Now copy the results back to the reconstruction buffer.
|
||||
mov eax,dword ptr [ReconPtr] ; Load the reconstruction Pointer
|
||||
mov ecx,dword ptr [TmpDataPtr] ; Load the temp results pointer
|
||||
// Row 1
|
||||
movq mm0,dword ptr [ecx] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 2
|
||||
movq mm0,dword ptr [ecx+8] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 3
|
||||
movq mm0,dword ptr [ecx+16] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 4
|
||||
movq mm0,dword ptr [ecx+24] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 5
|
||||
movq mm0,dword ptr [ecx+32] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 6
|
||||
movq mm0,dword ptr [ecx+40] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 7
|
||||
movq mm0,dword ptr [ecx+48] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
// Row 8
|
||||
movq mm0,dword ptr [ecx+56] ; Load 8 elements of results data
|
||||
movq dword ptr [eax],mm0 ; Write the data tot he reconstruction buffer.
|
||||
add eax,edx ; Step the reconstruction pointer
|
||||
|
||||
//emms
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
351
Src/libvpShared/corelibs/cdxv/vputil/win32/uoptsystemdependant.c
Normal file
351
Src/libvpShared/corelibs/cdxv/vputil/win32/uoptsystemdependant.c
Normal file
@@ -0,0 +1,351 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : SystemDependant.c
|
||||
*
|
||||
* Description : Miscellaneous system dependant functions
|
||||
*
|
||||
* AUTHOR : Paul Wilkins
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
* 1.20 YWX 06-Nov-02 Added forward DCT function optimized for Pentium 4
|
||||
* 1.19 YWX 15-Jun-01 added function pointer setups for new deblocking filter
|
||||
* 1.18 YWX 26-Apr-01 Fixed the cpu frequency detection bug caused by Sleep()
|
||||
* 1.17 JBX 22-Mar-01 Merged with new vp4-mapca bitstream
|
||||
* 1.16 JBB 26-Jan-01 Cleaned out unused function
|
||||
* 1.15 YWX 08-dec-00 Added WMT PostProcessor and
|
||||
* moved function declarations into _head files
|
||||
* 1.14 JBB 30 NOV 00 Version number changes
|
||||
* 1.13 YWX 03-Nov-00 Optimized postprocessor filters
|
||||
* 1.12 YWX 02-Nov-00 Added new loopfilter function pointers
|
||||
* 1.11 YWX 19-Oct-00 Added 1-2 Scaling functions pointers
|
||||
* 1.10 jbb 16 oct 00 added ifdefs to insure version code
|
||||
* 1.09 YWX 04-Oct-00 Added function pointers for scaling
|
||||
* 1.08 YWX 06 Sep 00 Added function pointers for new deringing filter
|
||||
* using frag based Q Value.
|
||||
* 1.07 JBB 21 Aug 00 New More Blurry in high variance area deringer
|
||||
* 1.06 YWX 2 Aug 00 Added function pointers for postprocess
|
||||
* 1.05 YWX 15/05/00 Added functions to check processor frequency
|
||||
* and more function pointers for postprocessor
|
||||
* 1.04 YWX 08/05/00 Added function pointers setup for postprocess
|
||||
* 1.03 SJL 20/04/00 Added ability to enable the new dequant code.
|
||||
* 1.02 SJL 22/03/00 Function pointers for the loop filter.
|
||||
* 1.01 JBB 21/03/00 More Function Pointers for optimized playback
|
||||
* 1.00 PGW 12/10/99 Configuration baseline
|
||||
*
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Header Files
|
||||
*****************************************************************************
|
||||
*/
|
||||
#include "codec_common.h"
|
||||
#include "vputil_if.h"
|
||||
#include "cpuidlib.h"
|
||||
|
||||
// Global debugging aids.
|
||||
// When non-zero, UtilMachineSpecificConfig points every idct[] entry at the
// full IDCT instead of the reduced 1- and 10-coefficient variants (seen in
// the Willamette path).
int fastIDCTDisabled = 0;
|
||||
// When non-zero, GetProcessorFlags ignores the detected CPU type and uses
// the value of CPUID below instead.
int forceCPUID = 0;
|
||||
// CPU type substituted for the detected one while forceCPUID is set.
int CPUID = 0;
|
||||
|
||||
|
||||
extern void GetProcessorFlags(INT32 *MmxEnabled, INT32 *XmmEnabled, INT32 *WmtEnabled);
|
||||
|
||||
// Scalar (no mmx) reconstruction functions
|
||||
extern void ClearSysState_C(void);
|
||||
extern void IDctSlow( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void IDct10( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void IDct1( INT16 * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void ScalarReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void ScalarReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void ScalarReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void ReconBlock_C(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
|
||||
extern void SubtractBlock_C( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
|
||||
extern void UnpackBlock_C( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
|
||||
extern void AverageBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
|
||||
extern void CopyBlock_C(unsigned char *src, unsigned char *dest, unsigned int srcstride);
|
||||
extern void Copy12x12_C(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
|
||||
extern void fdct_short_C ( INT16 * InputData, INT16 * OutputData );
|
||||
extern void FilterBlockBil_8_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
|
||||
extern void FilterBlock_C( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
|
||||
|
||||
// MMx versions
|
||||
extern void fdct_MMX ( INT16 * InputData, INT16 * OutputData );
|
||||
extern void ClearMmx(void);
|
||||
extern void MMXReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void MmxReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void MmxReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void MMX_idct( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void MMX_idct10( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void MMX_idct1( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void MMX_idct_DX( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void MMX_idct10_DX( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void ReconBlock_MMX(INT16 *SrcBlock,INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep);
|
||||
extern void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep );
|
||||
extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
|
||||
extern void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);
|
||||
extern void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride);
|
||||
extern void Copy12x12_MMX(const unsigned char *src, unsigned char *dest, unsigned int srcstride, unsigned int deststride);
|
||||
extern void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
|
||||
extern void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
|
||||
|
||||
// WMT versions
|
||||
extern void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr1, UINT8 * RefPtr2, INT16 * ChangePtr, UINT32 LineStep );
|
||||
extern void Wmt_idct1( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void Wmt_IDct_Dx( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void Wmt_IDct10_Dx( Q_LIST_ENTRY * InputData, INT16 *QuantMatrix, INT16 * OutputData );
|
||||
extern void fdct_WMT(short *InputData, short *OutputData);
|
||||
extern void FilterBlockBil_8_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 ReconPixelsPerLine, INT32 ModX, INT32 ModY );
|
||||
extern void FilterBlock_wmt( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha );
|
||||
|
||||
|
||||
// Value that fillidctconstants stores in the last four idctconstants entries.
#define IdctAdjustBeforeShift 8
|
||||
// 48 words, filled by fillidctconstants as 12 groups of 4:
// 4 groups forming a 4x4 diagonal of 65535, 7 groups each holding one
// idctcosTbl entry four times, and 1 group of IdctAdjustBeforeShift.
extern UINT16 idctconstants[(4+7+1) * 4];
|
||||
// Seven table entries that fillidctconstants replicates into idctconstants.
extern UINT16 idctcosTbl[ 7];
|
||||
|
||||
void fillidctconstants(void)
|
||||
{
|
||||
int j = 16;
|
||||
UINT16 * p;
|
||||
do
|
||||
{
|
||||
idctconstants[ --j] = 0;
|
||||
}
|
||||
while( j);
|
||||
|
||||
idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
|
||||
|
||||
j = 1;
|
||||
do
|
||||
{
|
||||
p = idctconstants + ( (j+3) << 2);
|
||||
p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
|
||||
}
|
||||
while( ++j <= 7);
|
||||
|
||||
idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : GetProcessorFlags
|
||||
*
|
||||
* INPUTS : None
|
||||
*
|
||||
* OUTPUTS : INT32 *MmxEnabled, *XmmEnabled, *WmtEnabled (each set to TRUE or FALSE)
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Checks for machine-specific features such as MMX support and
|
||||
* sets the appropriate capability flags.
|
||||
*
|
||||
* SPECIAL NOTES : None.
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void GetProcessorFlags
|
||||
(
|
||||
INT32 *MmxEnabled,
|
||||
INT32 *XmmEnabled,
|
||||
INT32 *WmtEnabled
|
||||
)
|
||||
{
|
||||
|
||||
PROCTYPE CPUType = findCPUId();
|
||||
if(forceCPUID)
|
||||
CPUType = CPUID;
|
||||
|
||||
switch(CPUType)
|
||||
{
|
||||
case X86 :
|
||||
case PPRO :
|
||||
case C6X86 :
|
||||
case C6X86MX:
|
||||
case AMDK5 :
|
||||
case MACG3 :
|
||||
case MAC68K :
|
||||
*MmxEnabled = FALSE;
|
||||
*XmmEnabled = FALSE;
|
||||
*WmtEnabled = FALSE;
|
||||
break;
|
||||
case PII :
|
||||
case AMDK63D:
|
||||
case AMDK6 :
|
||||
case PMMX :
|
||||
*MmxEnabled = TRUE;
|
||||
*XmmEnabled = FALSE;
|
||||
*WmtEnabled = FALSE;
|
||||
break;
|
||||
case XMM :
|
||||
*MmxEnabled = TRUE;
|
||||
*XmmEnabled = TRUE;
|
||||
*WmtEnabled = FALSE;
|
||||
break;
|
||||
case WMT :
|
||||
*MmxEnabled = TRUE;
|
||||
*XmmEnabled = TRUE;
|
||||
*WmtEnabled = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : MachineSpecificConfig
|
||||
*
|
||||
* INPUTS : None
|
||||
*
|
||||
* OUTPUTS : None
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Checks for machine specifc features such as MMX support
|
||||
* sets approipriate flags and function pointers.
|
||||
*
|
||||
* SPECIAL NOTES : None.
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void UtilMachineSpecificConfig
|
||||
(
|
||||
void
|
||||
)
|
||||
{
|
||||
UINT32 i;
|
||||
INT32 MmxEnabled;
|
||||
INT32 XmmEnabled;
|
||||
INT32 WmtEnabled;
|
||||
|
||||
GetProcessorFlags( &MmxEnabled,&XmmEnabled,&WmtEnabled);
|
||||
|
||||
if(WmtEnabled) //Willamette
|
||||
{
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
|
||||
if(fastIDCTDisabled)
|
||||
idct[i]=Wmt_IDct_Dx;
|
||||
else
|
||||
{
|
||||
if(i<=1)idct[i]=Wmt_idct1;
|
||||
else if(i<=10)idct[i]=Wmt_IDct10_Dx;
|
||||
else idct[i]=Wmt_IDct_Dx;
|
||||
}
|
||||
}
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
if(fastIDCTDisabled)
|
||||
idctc[i]=MMX_idct;
|
||||
else
|
||||
{
|
||||
if(i<=1)idctc[i]=Wmt_idct1;
|
||||
else if(i<=10)idctc[i]=MMX_idct10;
|
||||
else idctc[i]=MMX_idct;
|
||||
}
|
||||
}
|
||||
fdct_short=fdct_WMT;
|
||||
|
||||
ReconIntra = WmtReconIntra;
|
||||
ReconInter = WmtReconInter;
|
||||
ReconInterHalfPixel2 = WmtReconInterHalfPixel2;
|
||||
ClearSysState = ClearMmx;
|
||||
AverageBlock = AverageBlock_MMX;
|
||||
UnpackBlock = UnpackBlock_MMX;
|
||||
ReconBlock = ReconBlock_MMX;
|
||||
SubtractBlock = SubtractBlock_MMX;
|
||||
CopyBlock = CopyBlockMMX;
|
||||
Copy12x12 = Copy12x12_MMX;
|
||||
FilterBlockBil_8 = FilterBlockBil_8_wmt;
|
||||
FilterBlock=FilterBlock_wmt;
|
||||
//FilterBlock=FilterBlock_C;
|
||||
}
|
||||
else if ( MmxEnabled )
|
||||
{
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
if(fastIDCTDisabled)
|
||||
idctc[i]=MMX_idct_DX;
|
||||
else
|
||||
{
|
||||
if(i<=1)idctc[i]=MMX_idct1;
|
||||
else if(i<=10)idctc[i]=MMX_idct10;
|
||||
else idctc[i]=MMX_idct;
|
||||
}
|
||||
}
|
||||
fdct_short=fdct_MMX;
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
if(fastIDCTDisabled)
|
||||
idct[i]=MMX_idct_DX;
|
||||
else
|
||||
{
|
||||
if(i<=1)idct[i]=MMX_idct1;
|
||||
else if(i<=10)idct[i]=MMX_idct10_DX;
|
||||
else idct[i]=MMX_idct_DX;
|
||||
}
|
||||
}
|
||||
|
||||
ReconIntra = MMXReconIntra;
|
||||
ReconInter = MmxReconInter;
|
||||
ReconInterHalfPixel2 = MmxReconInterHalfPixel2;
|
||||
ClearSysState = ClearMmx;
|
||||
AverageBlock = AverageBlock_MMX;
|
||||
UnpackBlock = UnpackBlock_MMX;
|
||||
ReconBlock = ReconBlock_MMX;
|
||||
SubtractBlock = SubtractBlock_MMX;
|
||||
CopyBlock = CopyBlockMMX;
|
||||
Copy12x12 = Copy12x12_MMX;
|
||||
FilterBlockBil_8 = FilterBlockBil_8_mmx;
|
||||
FilterBlock=FilterBlock_mmx;
|
||||
//FilterBlock=FilterBlock_C;
|
||||
}
|
||||
else
|
||||
{
|
||||
int i;
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
if(fastIDCTDisabled)
|
||||
idctc[i]=IDctSlow;
|
||||
else
|
||||
{
|
||||
if(i<=1)idctc[i]=IDct1;
|
||||
else if(i<=10)idctc[i]=IDct10;
|
||||
else idctc[i]=IDctSlow;
|
||||
}
|
||||
}
|
||||
fdct_short=fdct_short_C ;
|
||||
for(i=0;i<=64;i++)
|
||||
{
|
||||
if(fastIDCTDisabled)
|
||||
idct[i]=IDctSlow;
|
||||
else
|
||||
{
|
||||
if(i<=1)idct[i]=IDct1;
|
||||
else if(i<=10)idct[i]=IDct10;
|
||||
else idct[i]=IDctSlow;
|
||||
}
|
||||
}
|
||||
ClearSysState = ClearSysState_C;
|
||||
ReconIntra = ScalarReconIntra;
|
||||
ReconInter = ScalarReconInter;
|
||||
ReconInterHalfPixel2 = ScalarReconInterHalfPixel2;
|
||||
AverageBlock = AverageBlock_C;
|
||||
UnpackBlock = UnpackBlock_C;
|
||||
ReconBlock = ReconBlock_C;
|
||||
SubtractBlock = SubtractBlock_C;
|
||||
CopyBlock = CopyBlock_C;
|
||||
Copy12x12 = Copy12x12_MMX;
|
||||
FilterBlockBil_8 = FilterBlockBil_8_C;
|
||||
FilterBlock=FilterBlock_C;
|
||||
}
|
||||
//FilterBlock=FilterBlock_C;
|
||||
|
||||
}
|
||||
507
Src/libvpShared/corelibs/cdxv/vputil/win32/vputilasm.c
Normal file
507
Src/libvpShared/corelibs/cdxv/vputil/win32/vputilasm.c
Normal file
@@ -0,0 +1,507 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : newLoopTest_asm.c
|
||||
*
|
||||
* Description : Codec specific functions
|
||||
*
|
||||
* AUTHOR : Yaowu Xu
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
* 1.02 YWX 03-Nov-00 Changed confusing variable name
|
||||
* 1.01 YWX 02-Nov-00 Added the set of functions
|
||||
* 1.00 YWX 19-Oct-00 configuration baseline
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Header Files
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#define STRICT /* Strict type checking. */
|
||||
#include "codec_common.h"
|
||||
#include <math.h>
|
||||
|
||||
/****************************************************************************
|
||||
* Module constants.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* Explicit Imports
|
||||
*****************************************************************************
|
||||
*/
|
||||
extern void SatUnsigned8( UINT8 * ResultPtr, INT16 * DataBlock,
|
||||
UINT32 ResultLineStep, UINT32 DataLineStep );
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Global Variables
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Functions
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Module Statics
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Forward References
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : ClearMmx()
|
||||
*
|
||||
*
|
||||
* INPUTS : None
|
||||
*
|
||||
* OUTPUTS :
|
||||
*
|
||||
* RETURNS :
|
||||
*
|
||||
*
|
||||
* FUNCTION : Clears down the MMX state
|
||||
*
|
||||
* SPECIAL NOTES : None.
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void ClearMmx(void)
|
||||
{
|
||||
__asm
|
||||
{
|
||||
emms ; Clear the MMX state.
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : CopyBlockUsingMMX
|
||||
*
|
||||
* INPUTS : None
|
||||
*
|
||||
* OUTPUTS : None
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Copies a block from source to destination
|
||||
*
|
||||
* SPECIAL NOTES : None.
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void CopyBlockMMX(unsigned char *src, unsigned char *dest, unsigned int srcstride)
|
||||
{
|
||||
unsigned char *s = src;
|
||||
unsigned char *d = dest;
|
||||
unsigned int stride = srcstride;
|
||||
// recon copy
|
||||
_asm
|
||||
{
|
||||
mov ecx, [stride]
|
||||
mov eax, [s]
|
||||
mov ebx, [d]
|
||||
lea edx, [ecx + ecx * 2]
|
||||
|
||||
movq mm0, [eax]
|
||||
movq mm1, [eax + ecx]
|
||||
movq mm2, [eax + ecx*2]
|
||||
movq mm3, [eax + edx]
|
||||
|
||||
lea eax, [eax + ecx*4]
|
||||
|
||||
movq [ebx], mm0
|
||||
movq [ebx + ecx], mm1
|
||||
movq [ebx + ecx*2], mm2
|
||||
movq [ebx + edx], mm3
|
||||
|
||||
lea ebx, [ebx + ecx * 4]
|
||||
|
||||
movq mm0, [eax]
|
||||
movq mm1, [eax + ecx]
|
||||
movq mm2, [eax + ecx*2]
|
||||
movq mm3, [eax + edx]
|
||||
|
||||
movq [ebx], mm0
|
||||
movq [ebx + ecx], mm1
|
||||
movq [ebx + ecx*2], mm2
|
||||
movq [ebx + edx], mm3
|
||||
}
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : CopyBlockUsingMMX
|
||||
*
|
||||
* INPUTS : None
|
||||
*
|
||||
* OUTPUTS : None
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Copies a block from source to destination
|
||||
*
|
||||
* SPECIAL NOTES : None.
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void Copy12x12_MMX(
|
||||
const unsigned char *src,
|
||||
unsigned char *dest,
|
||||
unsigned int srcstride,
|
||||
unsigned int deststride)
|
||||
{
|
||||
|
||||
|
||||
int j=0;
|
||||
do
|
||||
{
|
||||
((UINT32*)dest)[0] = ((UINT32*)src)[0];
|
||||
((UINT32*)dest)[1] = ((UINT32*)src)[1];
|
||||
((UINT32*)dest)[2] = ((UINT32*)src)[2];
|
||||
src+=srcstride;
|
||||
dest+=deststride;
|
||||
}
|
||||
while(++j<12);
|
||||
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : AverageBlock_MMX
|
||||
*
|
||||
* INPUTS : Two block data to be averaged
|
||||
*
|
||||
* OUTPUTS : block with the average values
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Do pixel averages on two reference blocks
|
||||
*
|
||||
* SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void AverageBlock_MMX( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
mov esi, ReconPtr1
|
||||
mov eax, ReconPtr2
|
||||
|
||||
mov edi, ReconRefPtr
|
||||
mov ecx, BLOCK_HEIGHT_WIDTH
|
||||
|
||||
mov edx, ReconPixelsPerLine
|
||||
pxor mm7, mm7
|
||||
|
||||
AverageBlock_Loop:
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm1, [eax]
|
||||
|
||||
movq mm2, mm0
|
||||
punpcklbw mm0, mm7
|
||||
|
||||
movq mm3, mm1
|
||||
punpcklbw mm1, mm7
|
||||
|
||||
paddw mm0, mm1
|
||||
punpckhbw mm2, mm7
|
||||
|
||||
psraw mm0, 1
|
||||
punpckhbw mm3, mm7
|
||||
|
||||
paddw mm2, mm3
|
||||
movq [edi], mm0
|
||||
|
||||
psraw mm2, 1
|
||||
add esi, edx
|
||||
|
||||
add eax, edx
|
||||
add edi, 16
|
||||
|
||||
movq [edi-8], mm2
|
||||
dec ecx
|
||||
|
||||
jnz AverageBlock_Loop
|
||||
}
|
||||
/*
|
||||
UINT32 i;
|
||||
|
||||
// For each block row
|
||||
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
|
||||
{
|
||||
ReconRefPtr[0] = (INT16)((INT32)(ReconPtr1[0])+ ((INT32)ReconPtr2[0]))>>1;
|
||||
ReconRefPtr[1] = (INT16)((INT32)(ReconPtr1[1])+ ((INT32)ReconPtr2[1]))>>1;
|
||||
ReconRefPtr[2] = (INT16)((INT32)(ReconPtr1[2])+ ((INT32)ReconPtr2[2]))>>1;
|
||||
ReconRefPtr[3] = (INT16)((INT32)(ReconPtr1[3])+ ((INT32)ReconPtr2[3]))>>1;
|
||||
ReconRefPtr[4] = (INT16)((INT32)(ReconPtr1[4])+ ((INT32)ReconPtr2[4]))>>1;
|
||||
ReconRefPtr[5] = (INT16)((INT32)(ReconPtr1[5])+ ((INT32)ReconPtr2[5]))>>1;
|
||||
ReconRefPtr[6] = (INT16)((INT32)(ReconPtr1[6])+ ((INT32)ReconPtr2[6]))>>1;
|
||||
ReconRefPtr[7] = (INT16)((INT32)(ReconPtr1[7])+ ((INT32)ReconPtr2[7]))>>1;
|
||||
|
||||
// Start next row
|
||||
ReconPtr1 += ReconPixelsPerLine;
|
||||
ReconPtr2 += ReconPixelsPerLine;
|
||||
|
||||
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : UnpackBlock
|
||||
*
|
||||
* INPUTS : Block of char data to be converted to short
|
||||
*
|
||||
* OUTPUTS : converted output
|
||||
*
|
||||
* RETURNS : None.
|
||||
*
|
||||
* FUNCTION : Converted char block data to short
|
||||
*
|
||||
* SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine)
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
mov esi, ReconPtr
|
||||
mov edi, ReconRefPtr
|
||||
|
||||
mov ecx, BLOCK_HEIGHT_WIDTH
|
||||
mov edx, ReconPixelsPerLine
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
UnpackBlock_Loop:
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm2, mm0
|
||||
|
||||
punpcklbw mm0, mm7
|
||||
movq [edi], mm0
|
||||
|
||||
punpckhbw mm2, mm7
|
||||
add esi, edx
|
||||
|
||||
movq [edi+8], mm2
|
||||
add edi, 16
|
||||
|
||||
dec ecx
|
||||
jnz UnpackBlock_Loop
|
||||
}
|
||||
|
||||
/*
|
||||
UINT32 i;
|
||||
|
||||
// For each block row
|
||||
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
|
||||
{
|
||||
|
||||
ReconRefPtr[0] = (INT16)(ReconPtr[0]);
|
||||
ReconRefPtr[1] = (INT16)(ReconPtr[1]);
|
||||
ReconRefPtr[2] = (INT16)(ReconPtr[2]);
|
||||
ReconRefPtr[3] = (INT16)(ReconPtr[3]);
|
||||
ReconRefPtr[4] = (INT16)(ReconPtr[4]);
|
||||
ReconRefPtr[5] = (INT16)(ReconPtr[5]);
|
||||
ReconRefPtr[6] = (INT16)(ReconPtr[6]);
|
||||
ReconRefPtr[7] = (INT16)(ReconPtr[7]);
|
||||
|
||||
// Start next row
|
||||
ReconPtr += ReconPixelsPerLine;
|
||||
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : SubtractBlock
|
||||
*
|
||||
* INPUTS : Get the residue data for the block
|
||||
*
|
||||
* OUTPUTS : Source block data and ref block data
|
||||
*
|
||||
* RETURNS : residue block data
|
||||
*
|
||||
* FUNCTION : do pixel subtraction of ref block from source block
|
||||
*
|
||||
* SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void SubtractBlock_MMX( UINT8 *SrcBlock, INT16 *DestPtr, UINT32 LineStep )
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
|
||||
mov esi, SrcBlock
|
||||
mov edi, DestPtr
|
||||
|
||||
mov edx, LineStep
|
||||
mov ecx, 8
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
SubtractBlock_Loop:
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm1, [edi]
|
||||
|
||||
movq mm2, mm0
|
||||
punpcklbw mm0, mm7
|
||||
|
||||
movq mm3, [edi+8]
|
||||
psubw mm0, mm1
|
||||
|
||||
punpckhbw mm2, mm7
|
||||
movq [edi], mm0
|
||||
|
||||
psubw mm2, mm3
|
||||
add esi, edx
|
||||
|
||||
movq [edi+8], mm2
|
||||
add edi, 16
|
||||
|
||||
dec ecx
|
||||
jnz SubtractBlock_Loop
|
||||
}
|
||||
|
||||
/*
|
||||
UINT32 i;
|
||||
|
||||
// For each block row
|
||||
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
|
||||
{
|
||||
|
||||
DestPtr[0] = (INT16)((INT32)SrcBlock[0] - (INT32)DestPtr[0]);
|
||||
DestPtr[1] = (INT16)((INT32)SrcBlock[1] - (INT32)DestPtr[1]);
|
||||
DestPtr[2] = (INT16)((INT32)SrcBlock[2] - (INT32)DestPtr[2]);
|
||||
DestPtr[3] = (INT16)((INT32)SrcBlock[3] - (INT32)DestPtr[3]);
|
||||
DestPtr[4] = (INT16)((INT32)SrcBlock[4] - (INT32)DestPtr[4]);
|
||||
DestPtr[5] = (INT16)((INT32)SrcBlock[5] - (INT32)DestPtr[5]);
|
||||
DestPtr[6] = (INT16)((INT32)SrcBlock[6] - (INT32)DestPtr[6]);
|
||||
DestPtr[7] = (INT16)((INT32)SrcBlock[7] - (INT32)DestPtr[7]);
|
||||
|
||||
// Start next row
|
||||
SrcBlock += LineStep;
|
||||
DestPtr += BLOCK_HEIGHT_WIDTH;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : ReconBlock
|
||||
*
|
||||
* INPUTS :
|
||||
*
|
||||
* OUTPUTS :
|
||||
*
|
||||
* RETURNS :
|
||||
*
|
||||
* FUNCTION : Reconstrut a block using ref blocka and change data
|
||||
*
|
||||
* SPECIAL NOTES : This functions has a mmx version in newlooptest_asm.c
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void ReconBlock_MMX( INT16 *SrcBlock, INT16 *ReconRefPtr, UINT8 *DestBlock, UINT32 LineStep)
|
||||
{
|
||||
|
||||
__asm
|
||||
{
|
||||
|
||||
mov esi, SrcBlock
|
||||
mov eax, ReconRefPtr
|
||||
|
||||
mov edi, DestBlock
|
||||
mov ecx, 8
|
||||
|
||||
mov edx, LineStep
|
||||
pxor mm7, mm7
|
||||
|
||||
ReconBlock_Loop:
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm1, [eax]
|
||||
|
||||
movq mm2, [esi+8]
|
||||
movq mm3, [eax+8]
|
||||
|
||||
paddw mm0, mm1
|
||||
paddw mm2, mm3
|
||||
|
||||
packuswb mm0, mm2
|
||||
movq [edi], mm0
|
||||
|
||||
add esi, 16
|
||||
add eax, 16
|
||||
|
||||
add edi, edx
|
||||
dec ecx
|
||||
|
||||
jnz ReconBlock_Loop
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
UINT32 i;
|
||||
INT16 *SrcBlockPtr = SrcBlock;
|
||||
|
||||
// For each block row
|
||||
for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ )
|
||||
{
|
||||
SrcBlock[0] += ReconRefPtr[0];
|
||||
SrcBlock[1] += ReconRefPtr[1];
|
||||
SrcBlock[2] += ReconRefPtr[2];
|
||||
SrcBlock[3] += ReconRefPtr[3];
|
||||
SrcBlock[4] += ReconRefPtr[4];
|
||||
SrcBlock[5] += ReconRefPtr[5];
|
||||
SrcBlock[6] += ReconRefPtr[6];
|
||||
SrcBlock[7] += ReconRefPtr[7];
|
||||
|
||||
// Start next row
|
||||
SrcBlock += BLOCK_HEIGHT_WIDTH;
|
||||
ReconRefPtr += BLOCK_HEIGHT_WIDTH;
|
||||
}
|
||||
// Saturated the block and write to the output
|
||||
SatUnsigned8( DestBlock, SrcBlockPtr, LineStep, BLOCK_HEIGHT_WIDTH );
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
1859
Src/libvpShared/corelibs/cdxv/vputil/win32/wmtidct.c
Normal file
1859
Src/libvpShared/corelibs/cdxv/vputil/win32/wmtidct.c
Normal file
File diff suppressed because it is too large
Load Diff
281
Src/libvpShared/corelibs/cdxv/vputil/win32/wmtrecon.c
Normal file
281
Src/libvpShared/corelibs/cdxv/vputil/win32/wmtrecon.c
Normal file
@@ -0,0 +1,281 @@
|
||||
/****************************************************************************
|
||||
*
|
||||
* Module Title : WmtOptFunctions.c
|
||||
*
|
||||
* Description : willamette processor specific
|
||||
* optimised versions of functions
|
||||
*
|
||||
* AUTHOR : Yaowu Xu
|
||||
*
|
||||
* Special Note:
|
||||
*
|
||||
*****************************************************************************
|
||||
* Revision History
|
||||
*
|
||||
*
|
||||
* 1.03 YWX 07-Dec-00 Removed constants and functions that are not in use
|
||||
* Added push and pop ebx in WmtReconIntra
|
||||
* 1.02 YWX 30 Aug 00 changed to be compatible with Microsoft compiler
|
||||
* 1.01 YWX 13 JUL 00 New Willamette Optimized Functions
|
||||
* 1.00 YWX 14/06/00 Configuration baseline from OptFunctions.c
|
||||
*
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/*
|
||||
Use Tim's optimized version.
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Header Files
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
#define STRICT // Strict type checking.
|
||||
|
||||
#include "reconstruct.h"
|
||||
|
||||
/****************************************************************************
|
||||
* Module constants.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Imports.
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Global Variables
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Exported Functions
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
/****************************************************************************
|
||||
* Module Statics
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
|
||||
_declspec(align(16)) static UINT8 Eight128s[8] = {128,128,128,128,128,128,128,128};
|
||||
|
||||
#pragma warning( disable : 4799 ) // Disable no emms instruction warning!
|
||||
|
||||
/****************************************************************************
|
||||
* Forward References
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : WmtReconIntra
|
||||
*
|
||||
* INPUTS : INT16 * idct
|
||||
* Pointer to the output from the idct for this block
|
||||
*
|
||||
* UINT32 stride
|
||||
* Line Length in pixels in recon and reference images
|
||||
*
|
||||
*
|
||||
*
|
||||
*
|
||||
* OUTPUTS : UINT8 * dest
|
||||
* The reconstruction buffer
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs an intra block - wmt version
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void WmtReconIntra( INT16 *TmpDataBuffer, UINT8 * dest, UINT16 * idct, UINT32 stride )
|
||||
{
|
||||
(void)TmpDataBuffer;
|
||||
__asm
|
||||
{
|
||||
|
||||
push ebx
|
||||
|
||||
mov eax,[idct] ; Signed 16 bit inputs
|
||||
mov edx,[dest] ; Unsigned 8 bit outputs
|
||||
|
||||
movq xmm0,QWORD PTR [Eight128s] ; Set xmm0 to 0x000000000000008080808080808080
|
||||
pxor xmm3, xmm3 ; set xmm3 to 0
|
||||
;
|
||||
mov ebx,[stride] ; Line stride in output buffer
|
||||
lea ecx,[eax+128] ; Endpoint in input buffer
|
||||
|
||||
loop_label:
|
||||
|
||||
movdqa xmm2,XMMWORD PTR [eax] ; Read the eight inputs
|
||||
packsswb xmm2,xmm3 ;
|
||||
|
||||
pxor xmm2,xmm0 ; Convert result to unsigned (same as add 128)
|
||||
lea eax,[eax + 16] ; Step source buffer
|
||||
|
||||
cmp eax,ecx ; are we done
|
||||
movq QWORD PTR [edx],xmm2 ; store results
|
||||
|
||||
lea edx,[edx+ebx] ; Step output buffer
|
||||
jc loop_label ; Loop back if we are not done
|
||||
|
||||
pop ebx
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : WmtReconInter
|
||||
*
|
||||
* INPUTS : UINT8 * RefPtr
|
||||
* The last frame reference
|
||||
*
|
||||
* INT16 * ChangePtr
|
||||
* Pointer to the change data
|
||||
*
|
||||
* UINT32 LineStep
|
||||
* Line Length in pixels in recon and ref images
|
||||
*
|
||||
* OUTPUTS : UINT8 * ReconPtr
|
||||
* The reconstruction
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs data from last data and change
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
void WmtReconInter( INT16 *TmpDataBuffer, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
(void) TmpDataBuffer;
|
||||
|
||||
_asm {
|
||||
push edi
|
||||
|
||||
mov ebx, [RefPtr]
|
||||
mov ecx, [ChangePtr]
|
||||
|
||||
mov eax, [ReconPtr]
|
||||
mov edx, [LineStep]
|
||||
|
||||
pxor xmm0, xmm0
|
||||
lea edi, [ecx + 128]
|
||||
L:
|
||||
movq xmm2, QWORD ptr [ebx] ; (+3 misaligned) 8 reference pixels
|
||||
movdqa xmm4, XMMWORD ptr [ecx] ; 8 changes
|
||||
|
||||
punpcklbw xmm2, xmm0 ;
|
||||
|
||||
add ebx, edx ; next row of reference pixels
|
||||
paddsw xmm2, xmm4 ; add in first 4 changes
|
||||
|
||||
lea ecx, [ecx + 16] ; next row of changes
|
||||
packuswb xmm2, xmm0 ; pack result to unsigned 8-bit values
|
||||
|
||||
cmp ecx, edi ; are we done?
|
||||
movq QWORD PTR [eax], xmm2 ; store result
|
||||
|
||||
lea eax, [eax+edx] ; next row of output
|
||||
jc L ; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix
|
||||
|
||||
pop edi
|
||||
}
|
||||
|
||||
}
|
||||
/****************************************************************************
|
||||
*
|
||||
* ROUTINE : WmtReconInterHalfPixel2
|
||||
*
|
||||
* INPUTS : UINT8 * RefPtr1, RefPtr2
|
||||
* The last frame reference
|
||||
*
|
||||
* INT16 * ChangePtr
|
||||
* Pointer to the change data
|
||||
*
|
||||
* UINT32 LineStep
|
||||
* Line Length in pixels in recon and ref images
|
||||
*
|
||||
*
|
||||
* OUTPUTS : UINT8 * ReconPtr
|
||||
* The reconstruction
|
||||
*
|
||||
* RETURNS : None
|
||||
*
|
||||
* FUNCTION : Reconstructs data from half pixel reference data and change.
|
||||
* Half pixel data interpolated from 2 references.
|
||||
*
|
||||
* SPECIAL NOTES :
|
||||
*
|
||||
*
|
||||
* ERRORS : None.
|
||||
*
|
||||
****************************************************************************/
|
||||
|
||||
void WmtReconInterHalfPixel2( INT16 *TmpDataBuffer, UINT8 * ReconPtr,
|
||||
UINT8 * RefPtr1, UINT8 * RefPtr2,
|
||||
INT16 * ChangePtr, UINT32 LineStep )
|
||||
{
|
||||
(void)TmpDataBuffer;
|
||||
|
||||
_asm {
|
||||
push esi
|
||||
push edi
|
||||
|
||||
mov ecx, [ChangePtr]
|
||||
mov esi, [RefPtr1]
|
||||
|
||||
mov edi, [RefPtr2]
|
||||
mov ebx, [ReconPtr]
|
||||
|
||||
mov edx, [LineStep]
|
||||
lea eax, [ecx+128]
|
||||
|
||||
pxor xmm0, xmm0
|
||||
|
||||
L:
|
||||
|
||||
movq xmm2, QWORD PTR [esi] ; (+3 misaligned) mm2 = row from ref1
|
||||
movq xmm4, QWORD PTR [edi] ; (+3 misaligned) mm4 = row from ref2
|
||||
|
||||
punpcklbw xmm2, xmm0 ;
|
||||
punpcklbw xmm4, xmm0 ;
|
||||
|
||||
movdqa xmm6, [ecx] ; mm6 = first 4 changes
|
||||
paddw xmm2, xmm4 ; mm2 = start (ref1 + ref2)
|
||||
|
||||
|
||||
psrlw xmm2, 1 ; mm2 = start (ref1 + ref2)/2
|
||||
paddw xmm2, xmm6 ; add changes to start
|
||||
|
||||
lea ecx, [ecx+16] ; next row idct
|
||||
packuswb xmm2, xmm0 ; pack start|end to unsigned 8-bit
|
||||
|
||||
add esi, edx ; next row ref1
|
||||
add edi, edx ; next row ref2
|
||||
|
||||
cmp ecx, eax
|
||||
movq QWORD PTR [ebx], xmm2 ; store result
|
||||
;
|
||||
lea ebx, [ebx+edx]
|
||||
jc L
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user