PC Welt 2004 March

home *** CD-ROM | disk | FTP | other *** search

/ PC Welt 2004 March / PCWELT_3_2004.ISO / pcwsoft / flaskmpeg_078_39_src.z.exe / flaskmpeg / Video / recon.c < prev next >

Wrap

C/C++ Source or Header | 2002-10-28 | 50.4 KB | 1,774 lines

/* Predict.c, motion compensation routines */ /* Last modified by PX3 July 30 2000 Optimizations done */ /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */ /* * Disclaimer of Warranty * * These software programs are available to the user without any license fee or * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims * any and all warranties, whether express, implied, or statuary, including any * implied warranties or merchantability or of fitness for a particular * purpose. In no event shall the copyright-holder be liable for any * incidental, punitive, or consequential damages of any kind whatsoever * arising from the use of these programs. * * This disclaimer of warranty extends to the user of these programs and user's * customers, employees, agents, transferees, successors, and assigns. * * The MPEG Software Simulation Group does not represent or warrant that the * programs furnished hereunder are free of infringement of any third-party * patents. * * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, * are subject to royalty fees to patent holders. Many of these patents are * general enough such that they are unavoidable regardless of implementation * design. * */ #include <stdio.h> #include "config.h" #include "global.h" //#include "getbits.h" /* private prototypes */ __inline static void form_prediction _ANSI_ARGS_((unsigned char *src[], int sfield, unsigned char *dst[], int dfield, int lx, int lx2, int w, int h, int x, int y, int dx, int dy, int average_flag)); __inline static void form_component_prediction _ANSI_ARGS_((unsigned char *src, unsigned char *dst, int lx, int lx2, int w, int h, int x, int y, int dx, int dy, int average_flag)); /* private data */ // 0.16B3, global scope declaration ensures proper data alignment static const __int64 mmMask0001= 0x0001000100010001; static const __int64 mmMask0002= 0x0002000200020002; static const __int64 mmMask0004= 0x0004000400040004; static const __int64 mmMask0101= 0x0101010101010101; static const __int64 mmMask00FF= 0x00FF00FF00FF00FF; static const __int64 mmMaskFFFF= 0xFFFFFFFFFFFFFFFF; static const __int64 mmMaskFEFE= 0xFEFEFEFEFEFEFEFE; static const __int64 mmMask9th = 0x00FF000000000000; void form_predictions(bx,by,macroblock_type,motion_type,PMV,motion_vertical_field_select,dmvector,stwtype) int bx, by; int macroblock_type; int motion_type; int PMV[2][2][2], motion_vertical_field_select[2][2], dmvector[2]; int stwtype; { static int currentfield; static unsigned char **predframe; static int DMV[2][2]; static int stwtop, stwbot; stwtop = stwtype%3; /* 0:temporal, 1:(spat+temp)/2, 2:spatial */ stwbot = stwtype/3; if ((macroblock_type & MACROBLOCK_MOTION_FORWARD) || (picture_coding_type==P_TYPE)) { if (picture_structure==FRAME_PICTURE) { if ((motion_type==MC_FRAME) || !(macroblock_type & MACROBLOCK_MOTION_FORWARD)) { /* frame-based prediction (broken into top and bottom halves for spatial scalability prediction purposes) */ if (stwtop<2) form_prediction(forward_reference_frame,0,current_frame,0, Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by, PMV[0][0][0],PMV[0][0][1],stwtop); if (stwbot<2) form_prediction(forward_reference_frame,1,current_frame,1, Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by, PMV[0][0][0],PMV[0][0][1],stwbot); } else if (motion_type==MC_FIELD) /* field-based prediction */ { /* top field prediction */ if (stwtop<2) form_prediction(forward_reference_frame,motion_vertical_field_select[0][0], current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by>>1,PMV[0][0][0],PMV[0][0][1]>>1,stwtop); /* bottom field prediction */ if (stwbot<2) form_prediction(forward_reference_frame,motion_vertical_field_select[1][0], current_frame,1,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by>>1,PMV[1][0][0],PMV[1][0][1]>>1,stwbot); } else if (motion_type==MC_DMV) /* dual prime prediction */ { /* calculate derived motion vectors */ Dual_Prime_Arithmetic(DMV,dmvector,PMV[0][0][0],PMV[0][0][1]>>1); if (stwtop<2) { /* predict top field from top field */ form_prediction(forward_reference_frame,0,current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1, PMV[0][0][0],PMV[0][0][1]>>1,0); /* predict and add to top field from bottom field */ form_prediction(forward_reference_frame,1,current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1, DMV[0][0],DMV[0][1],1); } if (stwbot<2) { /* predict bottom field from bottom field */ form_prediction(forward_reference_frame,1,current_frame,1, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1, PMV[0][0][0],PMV[0][0][1]>>1,0); /* predict and add to bottom field from top field */ form_prediction(forward_reference_frame,0,current_frame,1, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1, DMV[1][0],DMV[1][1],1); } } else /* invalid motion_type */ printf("invalid motion_type\n"); } else /* TOP_FIELD or BOTTOM_FIELD */ { /* field picture */ currentfield = (picture_structure==BOTTOM_FIELD); /* determine which frame to use for prediction */ if ((picture_coding_type==P_TYPE) && Second_Field && (currentfield!=motion_vertical_field_select[0][0])) predframe = backward_reference_frame; /* same frame */ else predframe = forward_reference_frame; /* previous frame */ if ((motion_type==MC_FIELD) || !(macroblock_type & MACROBLOCK_MOTION_FORWARD)) { /* field-based prediction */ if (stwtop<2) form_prediction(predframe,motion_vertical_field_select[0][0],current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by, PMV[0][0][0],PMV[0][0][1],stwtop); } else if (motion_type==MC_16X8) { if (stwtop<2) { form_prediction(predframe,motion_vertical_field_select[0][0],current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by, PMV[0][0][0],PMV[0][0][1],stwtop); /* determine which frame to use for lower half prediction */ if ((picture_coding_type==P_TYPE) && Second_Field && (currentfield!=motion_vertical_field_select[1][0])) predframe = backward_reference_frame; /* same frame */ else predframe = forward_reference_frame; /* previous frame */ form_prediction(predframe,motion_vertical_field_select[1][0],current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by+8, PMV[1][0][0],PMV[1][0][1],stwtop); } } else if (motion_type==MC_DMV) /* dual prime prediction */ { if (Second_Field) predframe = backward_reference_frame; /* same frame */ else predframe = forward_reference_frame; /* previous frame */ /* calculate derived motion vectors */ Dual_Prime_Arithmetic(DMV,dmvector,PMV[0][0][0],PMV[0][0][1]); /* predict from field of same parity */ form_prediction(forward_reference_frame,currentfield,current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by, PMV[0][0][0],PMV[0][0][1],0); /* predict from field of opposite parity */ form_prediction(predframe,!currentfield,current_frame,0, Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by, DMV[0][0],DMV[0][1],1); } else /* invalid motion_type */ printf("invalid motion_type\n"); } stwtop = stwbot = 1; } if (macroblock_type & MACROBLOCK_MOTION_BACKWARD) { if (picture_structure==FRAME_PICTURE) { if (motion_type==MC_FRAME) { /* frame-based prediction */ if (stwtop<2) form_prediction(backward_reference_frame,0,current_frame,0, Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by, PMV[0][1][0],PMV[0][1][1],stwtop); if (stwbot<2) form_prediction(backward_reference_frame,1,current_frame,1, Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by, PMV[0][1][0],PMV[0][1][1],stwbot); } else /* field-based prediction */ { /* top field prediction */ if (stwtop<2) form_prediction(backward_reference_frame,motion_vertical_field_select[0][1], current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by>>1,PMV[0][1][0],PMV[0][1][1]>>1,stwtop); /* bottom field prediction */ if (stwbot<2) form_prediction(backward_reference_frame,motion_vertical_field_select[1][1], current_frame,1,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by>>1,PMV[1][1][0],PMV[1][1][1]>>1,stwbot); } } else /* TOP_FIELD or BOTTOM_FIELD */ { /* field picture */ if (motion_type==MC_FIELD) { /* field-based prediction */ form_prediction(backward_reference_frame,motion_vertical_field_select[0][1], current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16, bx,by,PMV[0][1][0],PMV[0][1][1],stwtop); } else if (motion_type==MC_16X8) { form_prediction(backward_reference_frame,motion_vertical_field_select[0][1], current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by,PMV[0][1][0],PMV[0][1][1],stwtop); form_prediction(backward_reference_frame,motion_vertical_field_select[1][1], current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8, bx,by+8,PMV[1][1][0],PMV[1][1][1],stwtop); } else /* invalid motion_type */ printf("invalid motion_type\n"); } } } static void form_prediction(src,sfield,dst,dfield,lx,lx2,w,h,x,y,dx,dy,average_flag) unsigned char *src[]; /* prediction source buffer */ int sfield; /* prediction source field number (0 or 1) */ unsigned char *dst[]; /* prediction destination buffer */ int dfield; /* prediction destination field number (0 or 1)*/ int lx,lx2; /* line strides */ int w,h; /* prediction block/sub-block width, height */ int x,y; /* pixel co-ordinates of top-left sample in current MB */ int dx,dy; /* horizontal, vertical prediction address */ int average_flag; /* add prediction error to prediction ? */ { /* Y */ form_component_prediction(src[0]+(sfield?lx2>>1:0),dst[0]+(dfield?lx2>>1:0), lx,lx2,w,h,x,y,dx,dy,average_flag); /* if (chroma_format!=CHROMA444) { lx>>=1; lx2>>=1; w>>=1; x>>=1; dx/=2; } if (chroma_format==CHROMA420) { h>>=1; y>>=1; dy/=2; } */ if (chroma_format!=CHROMA444) { lx>>=1; lx2>>=1; w>>=1; x>>=1; dx/=2; if (chroma_format==CHROMA420) { h>>=1; y>>=1; dy/=2; } } /* Cb */ form_component_prediction(src[1]+(sfield?lx2>>1:0),dst[1]+(dfield?lx2>>1:0), lx,lx2,w,h,x,y,dx,dy,average_flag); /* Cr */ form_component_prediction(src[2]+(sfield?lx2>>1:0),dst[2]+(dfield?lx2>>1:0), lx,lx2,w,h,x,y,dx,dy,average_flag); } /* ISO/IEC 13818-2 section 7.6.4: Forming predictions */ /* NOTE: the arithmetic below produces numerically equivalent results * to 7.6.4, yet is more elegant. It differs in the following ways: * * 1. the vectors (dx, dy) are based on cartesian frame * coordiantes along a half-pel grid (always positive numbers) * In contrast, vector[r][s][t] are differential (with positive and * negative values). As a result, deriving the integer vectors * (int_vec[t]) from dx, dy is accomplished by a simple right shift. * * 2. Half pel flags (xh, yh) are equivalent to the LSB (Least * Significant Bit) of the half-pel coordinates (dx,dy). * * * NOTE: the work of combining predictions (ISO/IEC 13818-2 section 7.6.7) * is distributed among several other stages. This is accomplished by * folding line offsets into the source and destination (src,dst) * addresses (note the call arguments to form_prediction() in Predict()), * line stride variables lx and lx2, the block dimension variables (w,h), * average_flag, and by the very order in which Predict() is called. * This implementation design (implicitly different than the spec) * was chosen for its elegance. */ #define IS_XHALF ( dx & 0x01 ) #define IS_YHALF ( dy & 0x01 ) static void form_component_prediction(src,dst,lx,lx2,w,h,x,y,dx,dy,average_flag) unsigned char *src; unsigned char *dst; int lx; /* raster line increment */ int lx2; int w,h; int x,y; int dx,dy; int average_flag; /* flag that signals bi-directional or Dual-Prime averaging (7.6.7.1 and 7.6.7.4). if average_flag==1, a previously formed prediction has been stored in pel_pred[] */ { #if 0 static const __int64 mmmask_0001 = 0x0001000100010001; static const __int64 mmmask_0002 = 0x0002000200020002; static const __int64 mmmask_0003 = 0x0003000300030003; static const __int64 mmmask_0006 = 0x0006000600060006; unsigned char *s = src + lx * (y + (dy>>1)) + x + (dx>>1); unsigned char *d = dst + lx * y + x; int flag = (average_flag<<2) + ((dx & 1)<<1) + (dy & 1); switch (flag) { case 0: // d[i] = s[i]; __asm { mov eax, [s] mov ebx, [d] mov esi, 0x00 mov edi, [h] mc0: movq mm1, [eax+esi] add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc0 add eax, [lx2] add ebx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc0 emms } break; case 1: // d[i] = (s[i]+s[i+lx]+1)>>1; __asm { pxor mm0, mm0 movq mm7, [mmmask_0001] mov eax, [s] mov ebx, [d] mov ecx, eax add ecx, [lx] mov esi, 0x00 mov edi, [h] mc1: movq mm1, [eax+esi] movq mm2, [ecx+esi] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 paddsw mm1, mm7 paddsw mm3, mm7 psrlw mm1, 1 psrlw mm3, 1 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc1 add eax, [lx2] add ebx, [lx2] add ecx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc1 emms } break; case 2: // d[i] = (s[i]+s[i+1]+1)>>1; __asm { pxor mm0, mm0 movq mm7, [mmmask_0001] mov eax, [s] mov ebx, [d] mov esi, 0x00 mov edi, [h] mc2: movq mm1, [eax+esi] movq mm2, [eax+esi+1] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 paddsw mm1, mm7 paddsw mm3, mm7 psrlw mm1, 1 psrlw mm3, 1 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc2 add eax, [lx2] add ebx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc2 emms } break; case 3: // d[i] = (s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2; __asm { pxor mm0, mm0 movq mm7, [mmmask_0002] mov eax, [s] mov ebx, [d] mov ecx, eax add ecx, [lx] mov esi, 0x00 mov edi, [h] mc3: movq mm1, [eax+esi] movq mm2, [eax+esi+1] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 movq mm5, [ecx+esi] paddsw mm1, mm7 movq mm6, [ecx+esi+1] paddsw mm3, mm7 movq mm2, mm5 movq mm4, mm6 punpcklbw mm2, mm0 punpckhbw mm5, mm0 punpcklbw mm4, mm0 punpckhbw mm6, mm0 paddsw mm2, mm4 paddsw mm5, mm6 paddsw mm1, mm2 paddsw mm3, mm5 psrlw mm1, 2 psrlw mm3, 2 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc3 add eax, [lx2] add ebx, [lx2] add ecx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc3 emms } break; case 4: // d[i] = (s[i]+d[i]+1)>>1; __asm { pxor mm0, mm0 movq mm7, [mmmask_0001] mov eax, [s] mov ebx, [d] mov esi, 0x00 mov edi, [h] mc4: movq mm1, [eax+esi] movq mm2, [ebx+esi] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 paddsw mm1, mm7 paddsw mm3, mm7 psrlw mm1, 1 psrlw mm3, 1 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc4 add eax, [lx2] add ebx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc4 emms } break; case 5: // d[i] = ((d[i]<<1) + s[i]+s[i+lx] + 3)>>2; __asm { pxor mm0, mm0 movq mm7, [mmmask_0003] mov eax, [s] mov ebx, [d] mov ecx, eax add ecx, [lx] mov esi, 0x00 mov edi, [h] mc5: movq mm1, [eax+esi] movq mm2, [ecx+esi] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 movq mm5, [ebx+esi] paddsw mm1, mm7 paddsw mm3, mm7 movq mm6, mm5 punpcklbw mm5, mm0 punpckhbw mm6, mm0 psllw mm5, 1 psllw mm6, 1 paddsw mm1, mm5 paddsw mm3, mm6 psrlw mm1, 2 psrlw mm3, 2 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc5 add eax, [lx2] add ebx, [lx2] add ecx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc5 emms } break; case 6: // d[i] = ((d[i]<<1) + s[i]+s[i+1] + 3) >> 2; __asm { pxor mm0, mm0 movq mm7, [mmmask_0003] mov eax, [s] mov ebx, [d] mov esi, 0x00 mov edi, [h] mc6: movq mm1, [eax+esi] movq mm2, [eax+esi+1] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 movq mm5, [ebx+esi] paddsw mm1, mm7 paddsw mm3, mm7 movq mm6, mm5 punpcklbw mm5, mm0 punpckhbw mm6, mm0 psllw mm5, 1 psllw mm6, 1 paddsw mm1, mm5 paddsw mm3, mm6 psrlw mm1, 2 psrlw mm3, 2 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc6 add eax, [lx2] add ebx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc6 emms } break; default: // d[i] = ((d[i]<<2) + s[i]+s[i+1]+s[i+lx]+s[i+lx+1] + 6)>>3; __asm { pxor mm0, mm0 movq mm7, [mmmask_0006] mov eax, [s] mov ebx, [d] mov ecx, eax add ecx, [lx] mov esi, 0x00 mov edi, [h] mc7: movq mm1, [eax+esi] movq mm2, [eax+esi+1] movq mm3, mm1 movq mm4, mm2 punpcklbw mm1, mm0 punpckhbw mm3, mm0 punpcklbw mm2, mm0 punpckhbw mm4, mm0 paddsw mm1, mm2 paddsw mm3, mm4 movq mm5, [ecx+esi] paddsw mm1, mm7 movq mm6, [ecx+esi+1] paddsw mm3, mm7 movq mm2, mm5 movq mm4, mm6 punpcklbw mm2, mm0 punpckhbw mm5, mm0 punpcklbw mm4, mm0 punpckhbw mm6, mm0 paddsw mm2, mm4 paddsw mm5, mm6 paddsw mm1, mm2 paddsw mm3, mm5 movq mm6, [ebx+esi] movq mm4, mm6 punpcklbw mm4, mm0 punpckhbw mm6, mm0 psllw mm4, 2 psllw mm6, 2 paddsw mm1, mm4 paddsw mm3, mm6 psrlw mm1, 3 psrlw mm3, 3 packuswb mm1, mm0 packuswb mm3, mm0 psllq mm3, 32 por mm1, mm3 add esi, 0x08 cmp esi, [w] movq [ebx+esi-8], mm1 jl mc7 add eax, [lx2] add ebx, [lx2] add ecx, [lx2] sub edi, 0x01 mov esi, 0x00 cmp edi, 0x00 jg mc7 emms } break; } #else static int xint; /* horizontal integer sample vector: analogous to int_vec[0] */ static int yint; /* vertical integer sample vectors: analogous to int_vec[1] */ // static int xh; /* horizontal half sample flag: analogous to half_flag[0] */ // static int yh; /* vertical half sample flag: analogous to half_flag[1] */ static int lx2_x2; /*int i, j, v;*/ static unsigned char *s; /* source pointer: analogous to pel_ref[][] */ static unsigned char *d; /* destination pointer: analogous to pel_pred[][] */ // static unsigned long *p_lsrc; /* DWORD pointer to source */ // static unsigned long *p_ldst; /* DWORD pointer to dest */ static unsigned long *p_Alsrc; /* QWORD aligned version of p_lsrc */ static unsigned long *p_Aldst; /* QWORD aligned version of p_ldst */ static __int64 dSLLamt, dSRLamt; /* dst shift amount for alignment */ static DWORD sSLLamt, sSRLamt; /* src shift amount for alignment */ static DWORD sSLLamt_minus_8; // mmMask definitions moved to global scope, to enforce QWORD alignment lx2_x2 = (lx2<<1); /* two times lx2 */ /* half pel scaling for integer vectors */ xint = dx>>1; yint = dy>>1; /* derive half pel flags */ // xh = dx & 1; // yh = dy & 1; /* compute the linear address of pel_ref[][] and pel_pred[][] based on cartesian/raster cordinates provided */ s = src + lx*(y+yint) + x + xint; d = dst + lx*y + x; // p_lsrc = (unsigned long *) s; // p_ldst = (unsigned long *) d; p_Alsrc = (unsigned long*)((DWORD)s & 0xFFFFFFF8); // calculate QWORD aligned address p_Aldst = (unsigned long*)((DWORD)d & 0xFFFFFFF8); // calculate QWORD aligned address // calculate byte shift amounts sSRLamt = ((DWORD) s )- ((DWORD) p_Alsrc ); dSRLamt = (__int64)(((DWORD) d )- ((DWORD) p_Aldst )); sSRLamt = sSRLamt << 3; // convert byte-shift amt to bit shift amount dSRLamt = dSRLamt << 3; // convert byte-shift amt to bit shift amount sSLLamt = 64 - sSRLamt; dSLLamt = 64 - dSRLamt; sSLLamt_minus_8 = sSLLamt - 8; if (average_flag) { if (!IS_XHALF && !IS_YHALF) /* no horizontal nor vertical half-pel */ { // for (j=0; j<h; j++) /* yes average, fullX, fullY */ // { // for (i=0; i<w; i++) // { // v = d[i]+s[i]; // /*d[i] = (v+(v>=0?1:0))>>1;*/ // d[i] = (v+1)>>1; // } // // s+= lx2; // d+= lx2; // } __asm { // yes average, fullX, fullY // each processes 8-pixels, 8 pixels are written per iteration // there is some minor software pipelining to reduce the // store latency mov edx, [lx2] ; edx <= lx2 mov ecx, 0x00 ; ecx <= 0 movd mm5, dword ptr [sSRLamt ]; // mm5 = right-shift src alignment movd mm6, dword ptr [sSLLamt ]; // mm6 = left-shift src alignment movq mm7, qword ptr [mmMaskFEFE]; mm7 <= 0xFEFEFEFEFE... mov esi, dword ptr [p_Alsrc ] ;esi <= aligned p_lsrc[] mov edi, dword ptr [d ] ;edi <= p_ldst[0] mov ebx, dword ptr [p_Aldst] ; ebx <= aligned p_ldst[] //srl ebx, 3 ; ebx <= (w >> 3) (width / 8) mov eax, [h] ; eax <= h jmp lp_afxfy; // skip the first store lp_afxfy_st: // outer loop for( y =0; y < H; ++y ) movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3 // begin inner loop for ( ecx=0; ecx < ebx; ecx += 8 ) lp_afxfy : // Average, FullX/FullY // the read data operation is fairly invovled, to enforce QWORD // aligned reads movq mm0, qword ptr [ esi + ecx ] ;mm1 <= p_Alsrc[ 8..1 ] // movq mm3, mm0 ; mm3 <= Mask movq mm1, qword ptr [ esi + ecx +8] ;mm1 <= p_Alsrc[ 16..9 ] psrlq mm0, mm5; // align [8..1] movq mm2, qword ptr [ ebx + ecx ] ;mm2 <= p_Aldst[ 8..1 ] psllq mm1, mm6; // align [16..9] movq mm3, qword ptr [ ebx + ecx +8] ;mm2 <= p_Aldst[ 8..1 ] por mm0, mm1; // mm0 = desired pixels src[8..1] psrlq mm2, qword ptr [ dSRLamt ]; movq mm1, mm0; // mm1 = copy of mm0, pixels src[8..1] psllq mm3, qword ptr [ dSLLamt ]; pand mm0, mm7; // mm0 = MSB7 of src[8..1] por mm2, mm3; // mm2 = desired pixels dst[8..1] psrlw mm0, 1; // mm0 = src[8..1] / 2 movq mm3, mm2; // mm3 = copy of mm2, pixels dst[8..1] pand mm2, mm7; // mm2 = MSB7 of dst[8..1] psrlw mm2, 1; // mm2 = dst[8..1] / 2 por mm3, mm1; // mm3 = SRC[] OR DST[] pand mm3, qword ptr [mmMask0101]; // mm3 = "+1" rounding compensation paddusb mm2, mm0; // mm2 = (src+dst) / 2 add ecx, 0x08; paddusb mm3, mm2; // mm3 = final result dst[8..1] cmp ecx, [w]; ; // compare i <> width // instead of storing result here, causing a stall, the store // is moved to the beginning of the loop (past the branch-compare) // movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3 jb lp_afxfy_st; ; // if ( i < width ) loop sub eax, 0x01 ; j = j -1 // since we moved the store *after* the branch, we need to duplicate // the store statement for each branch-possibility movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3 add esi, edx ;p_Alsrc += lx2 add ebx, edx ;p_Aldst += lx2 add edi, edx ;p_ldst += lx2 mov ecx, 0x00000000 ; i = 0; cmp eax, 0 ; compare j <> 0 jg lp_afxfy ; if ( j >= 0 ) loop } // end __asm, end of yes average, fullX, fullY } else if (!IS_XHALF && IS_YHALF) /* no horizontal but vertical half-pel */ { // this MMX routine reads 4 x 2 pixels per iteration // this MMX routine outputs 4 pixels per iteration. // all reads (source and dest) are DWORD aligned __asm { // yes average, fullX, halfY // since the function processes only 4 pixels per iteration, we must // convert sSRLamt and dSRLamt to DWORD alignment values mov edx, dword ptr [sSRLamt ]; // left-shift src alignment mov eax, 0x00; // eax = x mov esi, dword ptr [s] ;//esi = source s[] (row y) and edx, 0x1F; // convert edx -> right-shift src alignment (DWORD) mov ebx, dword ptr [dSRLamt ]; // left-shift dst alignment and esi, 0xFFFFFFFC ; // esi = DWORD aligned source mov ecx, esi; // ecx = copy of esi and ebx, 0x1F; / convert ebx -> right-shift dst alignment (DWORD) movd mm4, dword ptr [d] ;//edi = destination d[] movd mm5, edx; // mm5 = right-shift src alignment (DWORD) add ecx, [lx]; // ecx = source s[] (row y+1) movd mm6, ebx; // mm6 = right-shift dest alignment (DWORD) mov edx, [lx2]; // edx = stride [lx2] pxor mm0, mm0; // mm0 = 0x0000_0000 mov ebx, [h]; // ebx =[heihgt] movq mm7, qword ptr [mmMask0002]; // mm7 = 0x00020002_... jmp lp_afxhy; // skip over initial store lp_afxhya: // for ( y = h; y > 0; --y) movd dword ptr [ edi + eax - 4], mm3; // store dst[4..1] lp_afxhy : // for ( x = 0; x < w; x+= 4) // the read data operation is fairly invovled, to enforce QWORD // aligned reads movd mm1, dword ptr [esi + eax]; // read src[4..1] (y) movd edi, mm4; punpckldq mm1, dword ptr [esi + eax + 4]; // mm1 = src[8..1] (y) and edi, 0xFFFFFFFC; // edi = DWORD aligned dst[4..1] movd mm2, dword ptr [ecx + eax]; // read src[4..1] (y+1) psrlq mm1, mm5; // mm1 = desired input src[4..1] (y) punpckldq mm2, dword ptr [ecx + eax + 4]; // mm2 = src[8..1] (y+1) movd mm3, dword ptr [ edi + eax ]; // mm3 = dst[4..1] psrlq mm2, mm5; // mm2 = desired input src[4..1] (y+1) punpckldq mm3, dword ptr [ edi + eax + 4]; // mm3 = dst[8..1] punpcklbw mm1, mm0; // mm1 = input src[_4 _3 _2 _1] (y) punpcklbw mm2, mm0; // mm2 = input src[_4 _3 _2 _1 ] (y+1) psrlq mm3, mm6; // mm3 = desired input dst[4..1] paddusw mm1, mm7; // mm1 = "+2" src[4..1] (rounding compensation) // we now have all the desired inputs, mm1, mm2, mm3 punpcklbw mm3, mm0; // mm3 = dst[_4 _3 _2 _1] (16-bit values) paddusw mm2, mm1; // mm2 = (src+src) (row y, y+1) movd edi, mm4; // restore copy psllw mm3, 0x01; // mm1 = (dst[ _4 _3 _2 _1] << 1 ) paddusw mm3, mm2; // mm2 = [ src(y)+src(y+1)+ 2*dst ] add eax, 0x04; // x += 4 cmp eax, [w]; // compare ( x <> w ) psrlw mm3, 0x02; // mm3 = (src+src+2*dst)/4 (desired output) packuswb mm3, mm3; // mm3 = ready for output // this store has been relocated to beginning of loop to avoid stall // movd dword ptr [ edi + eax -4], mm3; // store dst[4..1] jl lp_afxhya; // end for ( x =0 ... ) // since we moved store after the branch, we must replicate the store // for all possible outcomes of branch movd dword ptr [ edi + eax -4], mm3; // store dst[4..1] sub ebx, 0x01; // --y add esi, edx; // p_lsrc += lx2; row(y) add edi, edx; // p_ldst += lx2; add ecx, edx; // p_lsrc += lx2; row(y+1) mov eax, 0x00; // x = 0 movd mm4, edi; // move edi => mm4 (backup copy of dest[] pointer ) cmp ebx, 0x00; // compare y <> 0 jg lp_afxhy ; // end for ( y = 0 ... ) } // end __asm, end of yes average, fullX, halfY /* for (j=0; j<h; j++) // yes average, fullX, halfY { for (i=0; i<w; i++) { v = (((unsigned int)d[i])<<1) + ((unsigned int)(s[i]+s[i+lx])); //d[i]=(v+(v>=0?1:0))>>1; d[i]=(v+2)>>2; } s+= lx2; d+= lx2; } */ } else if (IS_XHALF && !IS_YHALF) /* horizontal but no vertical half-pel */ { // this MMX loop generates 8 output-pixels per iteration. // all reads are qword aligned __asm { // yes average, halfX, fullY mov edx, dword ptr [p_Aldst ]; // QWORD aligned dest[] mov eax, 0x00; // eax = x mov esi, dword ptr [p_Alsrc ]; // QWORD aligned src[] // pxor mm2, mm2; // mm2 = 0x0000_0000 mov edi, dword ptr [d] ;//edi = destination d[] mov ebx, [h]; // ebx = y movd mm6, dword ptr [sSLLamt ]; // mm6 = left-shift src alignment movd mm7, dword ptr [sSRLamt ]; // mm7 = right-shift src alignment jmp lp_ahxfy; // skip first store operation lp_ahxfya: // for ( y = h; y > 0; --y ) movq qword ptr [ edi + eax-8], mm1; lp_ahxfy : // for ( x = 0; x < w; x+= 8) // the read data operation is fairly involved, to enforce QWORD // aligned reads movq mm0, qword ptr [ esi + eax ]; // mm0 = p_Alsrc[8..1] // pxor mm2, mm2; // mm2 = 0x0000_0000 movq mm1, qword ptr [ esi + eax + 8]; // mm1 = p_Alsrc[16..9] psrlq mm0, mm7; // sSRL amount // movq mm3, mm1; // copy mm1 to mm3 movq mm2, mm1; // copy mm1 to mm2 psllq mm1, mm6; // sSLL amount movq mm4, qword ptr [ edx + eax]; // mm4 = p_Aldst[8..1] psrlq mm2, mm7; // shift right ( "9th pixel src[xxxxxx9]") movq mm5, qword ptr [ edx + eax + 8]; // mm5 = p_Aldst[16..9] psllw mm2, 8; // mm2 = "[ x_ x_ | x_ 9_ ]" // punpcklbw mm2, mm3; // mm2 = src[12_ 11_ 10_ 9_], desired input psrlq mm4, qword ptr [ dSRLamt ]; // dSRL amount por mm0, mm1; // mm0 = desired input src[8..1] psllq mm5, qword ptr [ dSLLamt ]; // dSLL amount movq mm1, mm0; // mm1 = copy of desired input src[8..1] por mm5, mm4; // desired input mm5 = dst[8..1] psrlw mm0, 8; // mm0 = src[ _8 _6 _4 _2] (even pixels) // we have read all the desired inputs, mm0, mm5, mm2 (and mm1) pand mm1, [mmMask00FF]; // mm1 = src[ _7 _5 _3 _1] (odd pixels) psllq mm2, 40; // mm2 = src [ _9 _ _ _ ] paddusw mm0, [mmMask0002]; // mm0 = "+2" round-up adjustment movq mm4, mm1; // mm4 = copy of [src 7 5 3 1] (odd) psrlq mm4, 16; // mm4 = [ src _ 7 5 3 ] movq mm3, mm5; // mm3 = copy of dest[8..1] psrlw mm5, 0x08; // mm5 = dst[ _8 _6 _4 _2] paddusw mm1, mm0; // mm1 = src[8..0] even+odd pixels pand mm3, [mmMask00FF]; // mm3 = 2*dest[ _7 _5 _4 _1] psllw mm5, 0x01; // mm5 = 2*dst[ _8 _6 _4 _2] por mm2, mm4; // mm4 = src [ _9 _7 _5 _3] psllw mm3, 0x01; // mm3 = 2*dst[ _7 _5 _3 _1] // pixel alignment should look like this // from source [98 87 76 65 54 43 32 21] // from dest-> [ 8 7 6 5 4 3 2 1] // EV ODD EV O E O E O paddusw mm1, mm3; // mm1 = (src+src+2*dst) [ odd pixels ] paddusw mm2, mm5; // mm2 = (src+src+2*dst) [ even pixels ] psrlw mm1, 2; // mm1 = (src+src+2*dst)/4 (__87 __65 __43 __21) paddusw mm2, mm0; // mm2 = (src+src) (__98 __76 __54 __32) psrlw mm2, 2; // mm2 = (src+src)/4 [ __98 __76 __54 __32 ] add eax, 0x08; cmp eax, [ w ]; // compare x <> w psllw mm2, 8; // mm2 = (src+src)/4 [ 98__ 76__ 54__ 32__ ] por mm1, mm2; // mm1 = (src+src)/4 [ 98 87 76 65 54 43 21 ] // this store has been relocated to beginning of loop to avoid stall // movq qword ptr [ edi + eax-8], mm1; jl lp_ahxfya ; // end for ( x =0 ... ) add esi, [lx2]; // p_lsrc += lx2; sub ebx, 0x01; // --y // since we moved store after the branch, we must replicate the store // for all possible outcomes of branch movq qword ptr [ edi + eax-8], mm1; add edi, [lx2]; // p_ldst += lx2; mov eax, 0x00; // x = 0 add edx, [lx2]; // p_Alsrc += lx2; cmp ebx, 0x00; // compare y <> 0 jg lp_ahxfy ; // end for ( y = height ... ) } // end __asm , end of yes-average, halfX, fullY /* for (j=0; j<h; j++) // yes average, halfX, fullY { for (i=0; i<w; i++) { //v = d[i] + ((unsigned int)(s[i]+s[i+1]+1)>>1); v = (((unsigned int)d[i])<<1) + ((unsigned int)(s[i]+s[i+1])); //d[i] = (v+(v>=0?1:0))>>1; d[i] = (v+2)>>2; } s+= lx2; d+= lx2; } */ } else /* if (xh && yh) horizontal and vertical half-pel */ { // this MMX loop generates 4 output-pixels per iteration. // 8 bytes are read from src[] and dst[] each iteration, but // the upper 3 bytes are discarded. __asm { // yes average, halfX, halfY mov esi, dword ptr [p_Alsrc] ;//esi = aligned source p_Alsrc[] mov eax, 0x00; // eax = x movd mm5, dword ptr [sSLLamt ]; // mm5 = left-shift src alignment movd mm6, dword ptr [sSRLamt ]; // mm6 = right-shift src alignment mov edi, dword ptr [d] ;//edi = destination d[] mov ebx, 0x00; // ebx = y mov edx, dword ptr [p_Aldst] ;//edx = aligned dest p_Aldst[] mov ecx, [lx]; //vertical stride (for row y+1) pxor mm7, mm7; // mm7 = 0x0000_0000 // movq mm6, qword ptr [mmMask0004]; // mm6 = 0x00040004_... jmp lp_ahxhy; // skip first store operation lp_ahxhya: // for ( y = 0; y < h; ++y ) movd dword ptr [ edi + eax - 4], mm2; lp_ahxhy : // for ( x = 0; x < w; x+= 4) movq mm0, qword ptr [ esi + eax ]; // 0a) mm0 = p_Alsrc[8..1] row(y) movq mm2, qword ptr [ esi + eax + 8]; // 0a) mm2 = p_Alsrc[16..9] row(y) psrlq mm0, mm6; // shift right mm0 movq mm1, qword ptr [ edx + eax ]; // 2a) mm1 = p_Aldst[8..1] add esi, ecx; // esi = src[] row(y+1) movq mm3, qword ptr [ edx + eax + 8 ]; // 2a) mm3 = p_Aldst[16..9] psllq mm2, mm5; // shift left mm2 psrlq mm1, [ dSRLamt ]; por mm0, mm2; //0a) mm0 = src[8..1] row(y) desired input psllq mm3, [ dSLLamt ]; movq mm4, mm0; // 1b) mm4 = copy of src[8..1] row(y) movq mm2, qword ptr [ esi + eax ]; // 3a) mm2 = src[8..1] row(y+1) por mm1, mm3; // mm1 = dst[8..1] desired input movq mm3, qword ptr [ esi + eax +8]; // 3a) mm2 = src[16..9] row(y+1) psrlq mm2, mm6; paddusw mm4, [mmMask0004]; psllq mm3, mm5; por mm2, mm3; // mm2 = src[7..1] row(y+1) desired input punpcklbw mm4, mm7; // 1c) mm4 = src [4..1] row(y) // 1*) mm4 = src [4..1] row(y) "+04" (rounding comp) psrlq mm0, 0x08; // 1d) mm0 = src [ _8765432 ] row(y) movq mm3, mm2; // 3b) mm3 = copy of src[8..1] row(y+1) punpcklbw mm0, mm7; // 1e) mm0 = src [5..2] row(y) add eax, 0x04; // x += 4; punpcklbw mm1, mm7; // 2b) mm1 = dst [4..1] row(y) paddusw mm0, mm4; // 1f) mm0 = (src+src) [54 43 32 21] row(y) // mm0 = partial result psllw mm1, 0x02; // 2c) mm1 = 4*dst[4..1] sub esi, ecx; // conver esi = p_Alsrc[] row(y), used to be row(y+1) paddusw mm0, mm1; // 2d) mm0 = (4*dst+src(y)), partial result punpcklbw mm3, mm7; // 3c) mm3 = src[4..1] row(y+1) psrlq mm2, 0x08; // 3d) mm2 = src[ _8765432 ] row(y+1) paddusw mm0, mm3; // mm0 = (4*dst+src(y)+(even)src(y+1)) partial result punpcklbw mm2, mm7; // 3e) mm2 = src[5..2] row(y+1) ;//slot paddusw mm2, mm0 // 5) mm2 = (4*dst+src(y)+src(y+1), almost final result ;//slot cmp eax, [w] psrlw mm2, 0x03; // 6) mm2 = [ 4...2 ] / 8 average packuswb mm2, mm2; // 6) mm0 = final result, ready to store // the store has been moved to AFTER the loop, to avoid stall // movd dword ptr [ edi + eax ], mm2; jl lp_ahxhya; // end for ( x = 0; x< w; x+=4 ); // 1 copy of the store is here, because of our software pipelining add esi, [lx2]; add ebx, 0x01; // ++y; movd dword ptr [ edi + eax - 4], mm2; add edi, [lx2]; mov eax, 0x00; // x = 0; add edx, [lx2]; cmp ebx, [h] jl lp_ahxhy; // end for ( y = 0; y < h; ++y ) } // end __asm , end of yes average, halfX, halfy ; /* for (j=0; j<h; j++) // yes average, halfX, halfY { for (i=0; i<w; i++) { //v = d[i] + ((unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2); v = (((unsigned int)d[i])<<2) + ((unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1])); //d[i] = (v+(v>=0?1:0))>>1; d[i] = (v+4)>>3; } s+= lx2; d+= lx2; }*/ } } else { /* -------- NO AVERAGE -------- */ if (!IS_XHALF && !IS_YHALF) /* no horizontal nor vertical half-pel */ { // for (j=0; j<h; j++) /* no average, fullX, fullY */ // { // for (i=0; i<(w>>2); ++i) // { // p_ldst[i] = p_lsrc[i]; // } // // p_lsrc += ( lx2 >> 2 ); // p_ldst += ( lx2 >> 2 ); // } __asm { // no average, fullX, fullY // processes 8*2 pixels per iteration (two rows, 8-pixels per row) // all reads (source and dest) are QWORD aligned movd mm6, [sSRLamt]; // mm6 =right-shift amount to QW align source mov ecx, 0x00; // ecx = x mov edi, dword ptr [d ] ;//edi <= p_ldst[0] mov ebx, [lx2] ; // ebx <= lx2 mov esi, dword ptr [p_Alsrc ] ;esi <= p_lsrc[0] mov edx, ebx ; //edx <= lx2 //srl ebx, 3 ; ebx <= (w >> 3) (width / 8) mov eax, [h] ; //eax <= h add edx, edi ; //edx <= p_ldst[] row(y+1) movd mm7, [sSLLamt]; // mm7 =left-shift amount to QW align source add ebx, esi ; //ebx <= p_lsrc[] row(y+1) jmp lp_fxfy; // skip intial store lp_fxfy_st2: movq qword ptr [ edi + ecx -8], mm0 ;dst[ i ] <= mm1 movq qword ptr [ edx + ecx -8], mm2 ;dst[ i ] <= mm1 // begin inner loop for ( ecx=0; ecx < [w]; ecx += 8 ) lp_fxfy : // no average, FullX/FullY movq mm0, qword ptr [ esi + ecx ] ;mm0 <= p_Alsrc[ i ] (row y) movq mm1, qword ptr [ esi + ecx + 8] ;mm0 <= p_Alsrc[ i ] (row y) psrlq mm0, mm6; // right shift [7..0] to align movq mm2, qword ptr [ ebx + ecx ] ;mm1 <= p_Alsrc[i] (row y+1) psllq mm1, mm7; // left shift [15..8] to align movq mm3, qword ptr [ ebx + ecx + 8] ;mm1 <= p_Alsrc[i] (row y+1) psrlq mm2, mm6; // right shift [7..0] to align por mm0, mm1; // mm0 = desired src[] 8-pixels (row y) psllq mm3, mm7; // left shift [15..8] to align por mm2, mm3; // mm2 = desired src[] 8-pixels (row y+1) add ecx, 0x08; // software pipelining, store-operation has been moved AFTER branch. cmp ecx, [w]; ; // compare i <> width jb lp_fxfy_st2; ; // if ( i < width ) loop // increment src/dst by two rows ( 2*lx2) movq qword ptr [ edi + ecx -8], mm0 ;p_ldst[ i ] <= mm1 add esi, [lx2_x2] ;//p_Alsrc += lx2 ( row y) movq qword ptr [ edx + ecx -8], mm2 ;p_ldst[ i ] <= mm1 add ebx, [lx2_x2] ;//p_Alsrc += lx2 ( row y+1) sub eax, 2 ;// j = j -2 add edi, [lx2_x2] ;//p_ldst += lx2 ( row y) mov ecx, 0x00 ;// i = 0; add edx, [lx2_x2] ;//p_ldst += lx2 ( row y+1) cmp eax, 0 ; compare j <> 0 jg lp_fxfy ; if ( j > 0 ) loop } // end __asm , end of no average, fullX, fullY } else if (!IS_XHALF && IS_YHALF) /* no horizontal but vertical half-pel */ { // this MMX routine produces 8 pixels per iteration. // all source reads are qword aligned __asm { // no average, fullX, halfY mov esi, dword ptr [p_Alsrc] ;//esi = aligned source s[] (row y) mov eax, 0x00; // eax = x movq mm7, qword ptr [mmMask0101]; // mm7 = 0x01010101_... mov ecx, esi; // ecx = copy of esi movq mm4, qword ptr [mmMaskFEFE]; // mm4 = 0xFEFEFEFE_... mov edi, dword ptr [d] ;//edi = destination d[] mov ebx, 0x00; // ebx = y add ecx, [lx]; // ecx = source s[] (row y+1) mov edx, [lx2]; // edx = stride [lx2] movd mm5, [sSRLamt]; // mm5=source right shift amount to align movd mm6, [sSLLamt]; // mm6=source left shift amount to align jmp lp_fxhy; // skip over initial store lp_fxhya: // for ( y = 0; y < h; ++y) movq qword ptr [ edi + eax - 8], mm3; // store dst[8..1] pxor mm2,mm2; // mm2 = 0x0000_0000 lp_fxhy : // for ( x = 0; x < w; x+= 8) movq mm0, qword ptr [ esi + eax ]; // mm0 = src[8..1] (y) movq mm1, qword ptr [ esi + eax + 8 ]; // mm1 = src[16..9] (y) psrlq mm0, mm5; // align src[8..1] (y) movq mm2, qword ptr [ ecx + eax ]; // mm2 = src[8..1] (y+1) psllq mm1, mm6; // align src[16..9] (y) movq mm3, qword ptr [ ecx + eax +8]; // mm3 = src[16..9] (y+1) psrlq mm2, mm5; // align src[8..1] (y+1) psllq mm3, mm6; // align src[16..9] (y+1) por mm0, mm1; // mm0 = desired pixels, src[ 8..1] (y) movq mm1, mm0; // mm1 = copy of mm0, src[ 8..1] (y) por mm2, mm3; // mm2 = desired pixels, src[8..1] (y+1) movq mm3, mm2; // mm3 = copy of mm2, src[ 8..1] (y+1) por mm2, mm0; // mm2 = src(y) OR src(y+1) pand mm1, mm4; // mm1 = MSB7 of src[8..1] (y) pand mm2, mm7; // mm2 = "+1" src[8..1] (round adjustment) pand mm3, mm4; // mm3 = MSB7 of src[8..1] (y+1) psrlw mm1, 1; // mm1 = src[8..1] / 2 (y) psrlw mm3, 1; // mm3 = src[8..1] / 2 (y+1) paddusb mm2, mm1; // mm2 = partial result paddusb mm3, mm2; // mm3 = (src+src) src[8..1] final result add eax, 0x08; // x += 8 cmp eax, [w]; // compare x <> w // this store has been relocated to beginning of loop to avoid stall // movd dword ptr [ edi + eax -8], mm2; // store dst[8..1] jl lp_fxhya; // end for ( x =0 ... ) // since we moved store after the branch, we must replicate the store // for all possible outcomes of branch movq qword ptr [ edi + eax -8], mm3; // store dst[8..1] add ebx, 0x01; // ++y add esi, edx; // p_lsrc += lx2; row(y) pxor mm2,mm2; // mm2 = 0x0000_0000 add edi, edx; // p_ldst += lx2; mov eax, 0x00; // x = 0 add ecx, edx; // p_lsrc += lx2; row(y+1) cmp ebx, [ h ]; // compare y <> height jl lp_fxhy ; // end for ( y = 0 ... ) } // end __asm , end of no average, fullX, halfY /* for (j=0; j<h; j++) // no average, fullX, halfY { for (i=0; i<w; i++) { d[i] = (unsigned int)(s[i]+s[i+lx]+1)>>1; } s+= lx2; d+= lx2; } */ } else if (IS_XHALF && !IS_YHALF) /* horizontal but no vertical half-pel */ { // this MMX loop generates 8 output-pixels per iteration. // all reads are QWORD aligned __asm { /* no average, halfX, fullY */ mov esi, dword ptr [p_Alsrc ] ;//esi = aligned source s[] mov eax, 0x00; // eax = x movd mm5, [sSRLamt]; // mm5 = src shift-right amount... movd mm6, [sSLLamt]; // mm6 = src shift-left amount... movd mm4, [sSLLamt_minus_8]; // mm4 = (mm6 - 8) mov edi, dword ptr [d] ;//edi = destination d[] mov ebx, [h]; // ebx = y movq mm7, qword ptr [mmMask0101]; // mm7 = 0x01010101_... jmp lp_hxfy; // skip first store operation lp_hxfya: // for ( y = 0; y < h; ++y ) movq qword ptr [ edi + eax-8], mm3; lp_hxfy : // for ( x = 0; x < w; x+= 8) movq mm0, qword ptr [ esi + eax ]; // mm0 = p_Alsrc[8..1] movq mm1, qword ptr [ esi + eax + 8]; // mm0 = p_Alsrc[16..9] psrlq mm0, mm5; // align src[8..1] (y) movq mm3, mm1; // copy mm3 = p_Alsrc[16..9] psllq mm1, mm6; // align src[16..] (y) movq mm2, mm0; // copy mm2 = src[ 8..1] (post shift) psllq mm3, mm4; // mm3 = src[16..9] psrlq mm2, 8; // mm2 = src[ _ 8..2] (top byte 00'd) por mm0, mm1; // mm0 = desired data src[8..1] por mm2, mm3; // mm2 = desired data src[9..2] movq mm1, mm0; // mm1 = copy of mm0, src[8..1] movq mm3, mm2; // mm3 = copy of mm2, src[9..2] por mm2, mm0; // mm0 = LSBits of src[ 8..1 ] pand mm1, qword ptr [mmMaskFEFE]; // mm1 = MSB7 of src[8..1] pand mm2, mm7; // mm2 = "+1" round compensation pand mm3, qword ptr [mmMaskFEFE]; // mm3 = MSB7 of src[9..2] psrlw mm1, 1; // mm1 = 1/2 src[8..1] paddusb mm2, mm1; // mm2 = partial result psrlw mm3, 1; // mm3 = 1/2 src[9..2] paddusb mm3, mm2; // mm3 = final result (src+src) [98 87 76 65 54 43 32 21] add eax, 0x08; cmp eax, [ w ]; // compare x <> w jl lp_hxfya ; // end for ( x =0 ... ) add esi, [lx2]; // p_lsrc += lx2; sub ebx, 0x01; // ++y // since we moved store after the branch, we must replicate the store // for all possible outcomes of branch movq qword ptr [ edi + eax-8], mm3; mov eax, 0x00; // x = 0 add edi, [lx2]; // p_ldst += lx2; cmp ebx, 0; // compare y <> 0 jg lp_hxfy ; // end for ( y = 0 ... ) } // end __asm /* for (j=0; j<h; j++) // no average, halfX, fullY { for (i=0; i<w; i++) { d[i] = (unsigned int)(s[i]+s[i+1]+1)>>1; } s+= lx2; d+= lx2; } */ } else /* if (xh && yh) horizontal and vertical half-pel */ { // this MMX loop generates 4 output-pixels per iteration. // 8 bytes are read from src[] and dst[] each iteration, but // the upper bytes are discarded. // the reads are QWORD aligned __asm { /* no average, halfX, halfy*/ mov esi, dword ptr [p_Alsrc];//esi = aligned source row(y) mov eax, 0x00; // eax = x mov edi, dword ptr [d] ;//edi = destination d[] mov ebx, 0x00; // ebx = y mov edx, [lx2]; // edx = stride [lx2] mov ecx, esi; // copy of esi add ecx, [lx]; // ecx = source s[] row(y+1) pxor mm7, mm7; // mm7 = 0x0000_0000 movd mm5, [sSRLamt]; // mm5 = src shift-right amount... movd mm6, [sSLLamt]; // mm6 = src shift-left amount... // movq mm6, qword ptr [mmMask0002]; // mm6 = 0x00020002_.. jmp lp_hxhy; // skip first store operation lp_hxhya: // for ( y = 0; y < h; ++y ) movd dword ptr [ edi + eax - 4], mm2; lp_hxhy : // for ( x = 0; x < w; x+= 4) movq mm0, qword ptr [ esi + eax ]; // 1a) mm0 = src[8..1] row(y) movq mm1, qword ptr [ esi + eax +8]; // 1a) mm0 = src[16..9] row(y) psrlq mm0, mm5; // align src[8..1] row(y) movq mm2, qword ptr [ ecx + eax ]; // 3a) mm2 = src[8..1] row(y+1) psllq mm1, mm6; // align src[16..9] row(y) movq mm3, qword ptr [ ecx + eax +8]; // 3a) mm2 = src[16..9] row(y+1) psrlq mm2, mm5; // align src[8..1] row(y+1) por mm0, mm1; // mm0 = desired pixels src[8..1] row(y) psllq mm3, mm6; // align src[16..9] row(y+1) por mm2, mm3; // mm2 = desired pixels src[8..1] row(y+1) movq mm4, mm0; // 1b) mm4 = copy of src[8..1] row(y) punpcklbw mm4, mm7; // 1c) mm4 = src [4..1] row(y) add eax, 0x04; // x += 4; paddusw mm4, [mmMask0002]; // 1*) mm4 = src [4..1] row(y) "+02" (rounding comp) psrlq mm0, 0x08; // 1d) mm0 = src [ _8765432 ] row(y) punpcklbw mm0, mm7; // 1e) mm0 = src [5..2] row(y) movq mm3, mm2; // 3b) mm3 = copy of src[8..1] row(y+1) punpcklbw mm3, mm7; // 3c) mm3 = src[4..1] row(y+1) paddusw mm0, mm4; // 1f) mm0 = (src+src) [54 43 32 21] row(y) // mm0 = partial result psrlq mm2, 0x08; // 3d) mm2 = src[ _8765432 ] row(y+1) paddusw mm3, mm0; // partial result mm3 = src(y) + (even)src(y+1) punpcklbw mm2, mm7; // 3e) mm2 = src[5..2] row(y+1) ;//slot paddusw mm2, mm3 ;// 5) mm2 = (src(y)+src(y+1), partial result ;//slot psrlw mm2, 0x02; // mm2 = (src+src)/4 average, 16-bit, almost final ;//slot packuswb mm2, mm2; // 6) mm2 = 8-bit pix final result, ready to store cmp eax, [w] // the store has been moved to AFTER the loop, to avoid stall // movd dword ptr [ edi + eax ], mm2; jl lp_hxhya; // end for ( x = 0; x< w; x+=4 ); // 1 copy of the store is here, because of our software pipelining movd dword ptr [ edi + eax - 4], mm2; add ebx, 0x01; // ++y; add esi, edx; // src(row y) += lx2 mov eax, 0x00; // x = 0; add ecx, edx; // src(row y+1) += lx2 add edi, edx; // dst(row y) += lx2 cmp ebx, [h] jl lp_hxhy; // end for ( y = 0; y < h; ++y ) } // end __asm /* for (j=0; j<h; j++) // no average, halfX, halfY { for (i=0; i<w; i++) { d[i] = (unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2; } s+= lx2; d+= lx2; } */ } } __asm emms; #endif } // form_component_prediction()