- /* Predict.c, motion compensation routines */
- /* Last modified by PX3, July 30 2000: optimizations done */
-
- /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
-
- /*
- * Disclaimer of Warranty
- *
- * These software programs are available to the user without any license fee or
- * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
- * any and all warranties, whether express, implied, or statutory, including any
- * implied warranties of merchantability or of fitness for a particular
- * purpose. In no event shall the copyright-holder be liable for any
- * incidental, punitive, or consequential damages of any kind whatsoever
- * arising from the use of these programs.
- *
- * This disclaimer of warranty extends to the user of these programs and user's
- * customers, employees, agents, transferees, successors, and assigns.
- *
- * The MPEG Software Simulation Group does not represent or warrant that the
- * programs furnished hereunder are free of infringement of any third-party
- * patents.
- *
- * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
- * are subject to royalty fees to patent holders. Many of these patents are
- * general enough such that they are unavoidable regardless of implementation
- * design.
- *
- */
-
- #include <stdio.h>
-
- #include "config.h"
- #include "global.h"
- //#include "getbits.h"
-
- /* private prototypes */
- __inline static void form_prediction _ANSI_ARGS_((unsigned char *src[], int sfield,
- unsigned char *dst[], int dfield,
- int lx, int lx2, int w, int h, int x, int y, int dx, int dy,
- int average_flag));
-
- __inline static void form_component_prediction _ANSI_ARGS_((unsigned char *src, unsigned char *dst,
- int lx, int lx2, int w, int h, int x, int y, int dx, int dy, int average_flag));
-
- /* private data */
- // 0.16B3: declared at global scope to ensure proper QWORD data alignment
- static const __int64 mmMask0001= 0x0001000100010001;
- static const __int64 mmMask0002= 0x0002000200020002;
- static const __int64 mmMask0004= 0x0004000400040004;
- static const __int64 mmMask0101= 0x0101010101010101;
- static const __int64 mmMask00FF= 0x00FF00FF00FF00FF;
- static const __int64 mmMaskFFFF= 0xFFFFFFFFFFFFFFFF;
- static const __int64 mmMaskFEFE= 0xFEFEFEFEFEFEFEFE;
- static const __int64 mmMask9th = 0x00FF000000000000;
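-
- /* The FEFE/0101 mask pair lets the MMX code below form a byte-wise
-    rounded average without unpacking to 16-bit words. A scalar sketch
-    of the identity relied on (hypothetical helper, illustration only): */
- #if 0
- static unsigned char avg_round_up(unsigned char a, unsigned char b)
- {
-     /* equivalent to (a+b+1)>>1, computed without 9-bit overflow:
-        halve each operand with the low bit masked off (FEFE), then
-        add back the OR of the low bits (0101) as the rounding term */
-     return (unsigned char)(((a & 0xFE) >> 1) + ((b & 0xFE) >> 1) + ((a | b) & 0x01));
- }
- #endif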
-
-
- void
- form_predictions(bx,by,macroblock_type,motion_type,PMV,motion_vertical_field_select,dmvector,stwtype)
- int bx, by;
- int macroblock_type;
- int motion_type;
- int PMV[2][2][2], motion_vertical_field_select[2][2], dmvector[2];
- int stwtype;
- {
- static int currentfield;
- static unsigned char **predframe;
- static int DMV[2][2];
- static int stwtop, stwbot;
-
- stwtop = stwtype%3; /* 0:temporal, 1:(spat+temp)/2, 2:spatial */
- stwbot = stwtype/3;
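- /* e.g. stwtype==3 gives stwtop==0 (temporal prediction for the top
-    field) and stwbot==1 ((spatial+temporal)/2 for the bottom field) */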
-
- if ((macroblock_type & MACROBLOCK_MOTION_FORWARD)
- || (picture_coding_type==P_TYPE))
- {
- if (picture_structure==FRAME_PICTURE)
- {
- if ((motion_type==MC_FRAME)
- || !(macroblock_type & MACROBLOCK_MOTION_FORWARD))
- {
- /* frame-based prediction (broken into top and bottom halves
- for spatial scalability prediction purposes) */
- if (stwtop<2)
- form_prediction(forward_reference_frame,0,current_frame,0,
- Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by,
- PMV[0][0][0],PMV[0][0][1],stwtop);
-
- if (stwbot<2)
- form_prediction(forward_reference_frame,1,current_frame,1,
- Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by,
- PMV[0][0][0],PMV[0][0][1],stwbot);
- }
- else if (motion_type==MC_FIELD) /* field-based prediction */
- {
- /* top field prediction */
- if (stwtop<2)
- form_prediction(forward_reference_frame,motion_vertical_field_select[0][0],
- current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by>>1,PMV[0][0][0],PMV[0][0][1]>>1,stwtop);
-
- /* bottom field prediction */
- if (stwbot<2)
- form_prediction(forward_reference_frame,motion_vertical_field_select[1][0],
- current_frame,1,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by>>1,PMV[1][0][0],PMV[1][0][1]>>1,stwbot);
- }
- else if (motion_type==MC_DMV) /* dual prime prediction */
- {
- /* calculate derived motion vectors */
- Dual_Prime_Arithmetic(DMV,dmvector,PMV[0][0][0],PMV[0][0][1]>>1);
-
- if (stwtop<2)
- {
- /* predict top field from top field */
- form_prediction(forward_reference_frame,0,current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1,
- PMV[0][0][0],PMV[0][0][1]>>1,0);
-
- /* predict and add to top field from bottom field */
- form_prediction(forward_reference_frame,1,current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1,
- DMV[0][0],DMV[0][1],1);
- }
-
- if (stwbot<2)
- {
- /* predict bottom field from bottom field */
- form_prediction(forward_reference_frame,1,current_frame,1,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1,
- PMV[0][0][0],PMV[0][0][1]>>1,0);
-
- /* predict and add to bottom field from top field */
- form_prediction(forward_reference_frame,0,current_frame,1,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by>>1,
- DMV[1][0],DMV[1][1],1);
- }
- }
- else
- /* invalid motion_type */
- printf("invalid motion_type\n");
- }
- else /* TOP_FIELD or BOTTOM_FIELD */
- {
- /* field picture */
- currentfield = (picture_structure==BOTTOM_FIELD);
-
- /* determine which frame to use for prediction */
- if ((picture_coding_type==P_TYPE) && Second_Field
- && (currentfield!=motion_vertical_field_select[0][0]))
- predframe = backward_reference_frame; /* same frame */
- else
- predframe = forward_reference_frame; /* previous frame */
-
- if ((motion_type==MC_FIELD)
- || !(macroblock_type & MACROBLOCK_MOTION_FORWARD))
- {
- /* field-based prediction */
- if (stwtop<2)
- form_prediction(predframe,motion_vertical_field_select[0][0],current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by,
- PMV[0][0][0],PMV[0][0][1],stwtop);
- }
- else if (motion_type==MC_16X8)
- {
- if (stwtop<2)
- {
- form_prediction(predframe,motion_vertical_field_select[0][0],current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by,
- PMV[0][0][0],PMV[0][0][1],stwtop);
-
- /* determine which frame to use for lower half prediction */
- if ((picture_coding_type==P_TYPE) && Second_Field
- && (currentfield!=motion_vertical_field_select[1][0]))
- predframe = backward_reference_frame; /* same frame */
- else
- predframe = forward_reference_frame; /* previous frame */
-
- form_prediction(predframe,motion_vertical_field_select[1][0],current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,bx,by+8,
- PMV[1][0][0],PMV[1][0][1],stwtop);
- }
- }
- else if (motion_type==MC_DMV) /* dual prime prediction */
- {
- if (Second_Field)
- predframe = backward_reference_frame; /* same frame */
- else
- predframe = forward_reference_frame; /* previous frame */
-
- /* calculate derived motion vectors */
- Dual_Prime_Arithmetic(DMV,dmvector,PMV[0][0][0],PMV[0][0][1]);
-
- /* predict from field of same parity */
- form_prediction(forward_reference_frame,currentfield,current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by,
- PMV[0][0][0],PMV[0][0][1],0);
-
- /* predict from field of opposite parity */
- form_prediction(predframe,!currentfield,current_frame,0,
- Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,bx,by,
- DMV[0][0],DMV[0][1],1);
- }
- else
- /* invalid motion_type */
- printf("invalid motion_type\n");
- }
- stwtop = stwbot = 1;
- }
-
- if (macroblock_type & MACROBLOCK_MOTION_BACKWARD)
- {
- if (picture_structure==FRAME_PICTURE)
- {
- if (motion_type==MC_FRAME)
- {
- /* frame-based prediction */
- if (stwtop<2)
- form_prediction(backward_reference_frame,0,current_frame,0,
- Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by,
- PMV[0][1][0],PMV[0][1][1],stwtop);
-
- if (stwbot<2)
- form_prediction(backward_reference_frame,1,current_frame,1,
- Coded_Picture_Width,Coded_Picture_Width<<1,16,8,bx,by,
- PMV[0][1][0],PMV[0][1][1],stwbot);
- }
- else /* field-based prediction */
- {
- /* top field prediction */
- if (stwtop<2)
- form_prediction(backward_reference_frame,motion_vertical_field_select[0][1],
- current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by>>1,PMV[0][1][0],PMV[0][1][1]>>1,stwtop);
-
- /* bottom field prediction */
- if (stwbot<2)
- form_prediction(backward_reference_frame,motion_vertical_field_select[1][1],
- current_frame,1,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by>>1,PMV[1][1][0],PMV[1][1][1]>>1,stwbot);
- }
- }
- else /* TOP_FIELD or BOTTOM_FIELD */
- {
- /* field picture */
- if (motion_type==MC_FIELD)
- {
- /* field-based prediction */
- form_prediction(backward_reference_frame,motion_vertical_field_select[0][1],
- current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,16,
- bx,by,PMV[0][1][0],PMV[0][1][1],stwtop);
- }
- else if (motion_type==MC_16X8)
- {
- form_prediction(backward_reference_frame,motion_vertical_field_select[0][1],
- current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by,PMV[0][1][0],PMV[0][1][1],stwtop);
-
- form_prediction(backward_reference_frame,motion_vertical_field_select[1][1],
- current_frame,0,Coded_Picture_Width<<1,Coded_Picture_Width<<1,16,8,
- bx,by+8,PMV[1][1][0],PMV[1][1][1],stwtop);
- }
- else
- /* invalid motion_type */
- printf("invalid motion_type\n");
- }
- }
- }
-
- static void form_prediction(src,sfield,dst,dfield,lx,lx2,w,h,x,y,dx,dy,average_flag)
- unsigned char *src[]; /* prediction source buffer */
- int sfield; /* prediction source field number (0 or 1) */
- unsigned char *dst[]; /* prediction destination buffer */
- int dfield; /* prediction destination field number (0 or 1)*/
- int lx,lx2; /* line strides */
- int w,h; /* prediction block/sub-block width, height */
- int x,y; /* pixel co-ordinates of top-left sample in current MB */
- int dx,dy; /* horizontal, vertical prediction address */
- int average_flag; /* add prediction error to prediction ? */
- {
- /* Y */
- form_component_prediction(src[0]+(sfield?lx2>>1:0),dst[0]+(dfield?lx2>>1:0),
- lx,lx2,w,h,x,y,dx,dy,average_flag);
- if (chroma_format!=CHROMA444)
- {
- lx>>=1; lx2>>=1; w>>=1; x>>=1; dx/=2;
-
- if (chroma_format==CHROMA420)
- {
- h>>=1; y>>=1; dy/=2;
- }
- }
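-
- /* e.g. with CHROMA420 a 16x16 luma area maps to an 8x8 area in each
-    chroma plane, so the strides, dimensions, position and half-pel
-    vectors are all halved before the Cb/Cr calls below */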
-
-
- /* Cb */
- form_component_prediction(src[1]+(sfield?lx2>>1:0),dst[1]+(dfield?lx2>>1:0),
- lx,lx2,w,h,x,y,dx,dy,average_flag);
-
- /* Cr */
- form_component_prediction(src[2]+(sfield?lx2>>1:0),dst[2]+(dfield?lx2>>1:0),
- lx,lx2,w,h,x,y,dx,dy,average_flag);
-
- }
-
- /* ISO/IEC 13818-2 section 7.6.4: Forming predictions */
- /* NOTE: the arithmetic below produces numerically equivalent results
- * to 7.6.4, yet is more elegant. It differs in the following ways:
- *
- * 1. the vectors (dx, dy) are based on cartesian frame
- * coordinates along a half-pel grid (always positive numbers).
- * In contrast, vector[r][s][t] are differential (with positive and
- * negative values). As a result, deriving the integer vectors
- * (int_vec[t]) from dx, dy is accomplished by a simple right shift.
- *
- * 2. Half pel flags (xh, yh) are equivalent to the LSB (Least
- * Significant Bit) of the half-pel coordinates (dx,dy).
- *
- *
- * NOTE: the work of combining predictions (ISO/IEC 13818-2 section 7.6.7)
- * is distributed among several other stages. This is accomplished by
- * folding line offsets into the source and destination (src,dst)
- * addresses (note the call arguments to form_prediction() in
- * form_predictions()), line stride variables lx and lx2, the block
- * dimension variables (w,h), average_flag, and by the very order in
- * which form_predictions() is called. This implementation design
- * (implicitly different from the spec)
- * was chosen for its elegance.
- */
-
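- /* For reference, a scalar sketch of the prediction cases the MMX code
-    below implements (a hypothetical helper mirroring 7.6.4; illustration
-    only, never called): */
- #if 0
- static void form_component_prediction_ref(unsigned char *src, unsigned char *dst,
-     int lx, int lx2, int w, int h, int x, int y, int dx, int dy, int average_flag)
- {
-     unsigned char *s = src + lx*(y+(dy>>1)) + x + (dx>>1);
-     unsigned char *d = dst + lx*y + x;
-     int xh = dx & 1, yh = dy & 1;
-     int i, j, v;
-
-     for (j=0; j<h; j++)
-     {
-         for (i=0; i<w; i++)
-         {
-             if (xh && yh)       /* horizontal and vertical half-pel */
-                 v = (s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2;
-             else if (xh)        /* horizontal half-pel only */
-                 v = (s[i]+s[i+1]+1)>>1;
-             else if (yh)        /* vertical half-pel only */
-                 v = (s[i]+s[i+lx]+1)>>1;
-             else                /* full-pel */
-                 v = s[i];
-             d[i] = (unsigned char)(average_flag ? (d[i]+v+1)>>1 : v);
-         }
-         s += lx2;
-         d += lx2;
-     }
- }
- #endif
-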
- #define IS_XHALF ( dx & 0x01 )
- #define IS_YHALF ( dy & 0x01 )
-
- static
- void form_component_prediction(src,dst,lx,lx2,w,h,x,y,dx,dy,average_flag)
- unsigned char *src;
- unsigned char *dst;
- int lx; /* raster line increment */
- int lx2;
- int w,h;
- int x,y;
- int dx,dy;
- int average_flag; /* flag that signals bi-directional or Dual-Prime
- averaging (7.6.7.1 and 7.6.7.4). if average_flag==1,
- a previously formed prediction has been stored in
- pel_pred[] */
- {
-
- #if 0
- static const __int64 mmmask_0001 = 0x0001000100010001;
- static const __int64 mmmask_0002 = 0x0002000200020002;
- static const __int64 mmmask_0003 = 0x0003000300030003;
- static const __int64 mmmask_0006 = 0x0006000600060006;
-
- unsigned char *s = src + lx * (y + (dy>>1)) + x + (dx>>1);
- unsigned char *d = dst + lx * y + x;
- int flag = (average_flag<<2) + ((dx & 1)<<1) + (dy & 1);
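-
- // flag layout: (average_flag<<2) | (xhalf<<1) | yhalf, so:
- //   0: full-pel copy          4: full-pel + average
- //   1: vertical half-pel      5: vertical half-pel + average
- //   2: horizontal half-pel    6: horizontal half-pel + average
- //   3: h+v half-pel           7: h+v half-pel + average (default)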
-
- switch (flag)
- {
- case 0:
- // d[i] = s[i];
- __asm
- {
- mov eax, [s]
- mov ebx, [d]
- mov esi, 0x00
- mov edi, [h]
- mc0:
- movq mm1, [eax+esi]
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc0
-
- add eax, [lx2]
- add ebx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc0
-
- emms
- }
- break;
-
- case 1:
- // d[i] = (s[i]+s[i+lx]+1)>>1;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0001]
- mov eax, [s]
- mov ebx, [d]
- mov ecx, eax
- add ecx, [lx]
- mov esi, 0x00
- mov edi, [h]
- mc1:
- movq mm1, [eax+esi]
- movq mm2, [ecx+esi]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- paddsw mm1, mm7
- paddsw mm3, mm7
-
- psrlw mm1, 1
- psrlw mm3, 1
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc1
-
- add eax, [lx2]
- add ebx, [lx2]
- add ecx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc1
-
- emms
- }
- break;
-
- case 2:
- // d[i] = (s[i]+s[i+1]+1)>>1;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0001]
- mov eax, [s]
- mov ebx, [d]
- mov esi, 0x00
- mov edi, [h]
- mc2:
- movq mm1, [eax+esi]
- movq mm2, [eax+esi+1]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- paddsw mm1, mm7
- paddsw mm3, mm7
-
- psrlw mm1, 1
- psrlw mm3, 1
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc2
-
- add eax, [lx2]
- add ebx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc2
-
- emms
- }
- break;
-
- case 3:
- // d[i] = (s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0002]
- mov eax, [s]
- mov ebx, [d]
- mov ecx, eax
- add ecx, [lx]
- mov esi, 0x00
- mov edi, [h]
- mc3:
- movq mm1, [eax+esi]
- movq mm2, [eax+esi+1]
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- movq mm5, [ecx+esi]
- paddsw mm1, mm7
-
- movq mm6, [ecx+esi+1]
- paddsw mm3, mm7
-
- movq mm2, mm5
- movq mm4, mm6
-
- punpcklbw mm2, mm0
- punpckhbw mm5, mm0
-
- punpcklbw mm4, mm0
- punpckhbw mm6, mm0
-
- paddsw mm2, mm4
- paddsw mm5, mm6
-
- paddsw mm1, mm2
- paddsw mm3, mm5
-
- psrlw mm1, 2
- psrlw mm3, 2
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc3
-
- add eax, [lx2]
- add ebx, [lx2]
- add ecx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc3
-
- emms
- }
- break;
-
- case 4:
- // d[i] = (s[i]+d[i]+1)>>1;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0001]
- mov eax, [s]
- mov ebx, [d]
- mov esi, 0x00
- mov edi, [h]
- mc4:
- movq mm1, [eax+esi]
- movq mm2, [ebx+esi]
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- paddsw mm1, mm7
- paddsw mm3, mm7
-
- psrlw mm1, 1
- psrlw mm3, 1
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc4
-
- add eax, [lx2]
- add ebx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc4
-
- emms
- }
- break;
-
- case 5:
- // d[i] = ((d[i]<<1) + s[i]+s[i+lx] + 3)>>2;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0003]
- mov eax, [s]
- mov ebx, [d]
- mov ecx, eax
- add ecx, [lx]
- mov esi, 0x00
- mov edi, [h]
- mc5:
- movq mm1, [eax+esi]
- movq mm2, [ecx+esi]
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- movq mm5, [ebx+esi]
-
- paddsw mm1, mm7
- paddsw mm3, mm7
-
- movq mm6, mm5
- punpcklbw mm5, mm0
- punpckhbw mm6, mm0
-
- psllw mm5, 1
- psllw mm6, 1
-
- paddsw mm1, mm5
- paddsw mm3, mm6
-
- psrlw mm1, 2
- psrlw mm3, 2
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc5
-
- add eax, [lx2]
- add ebx, [lx2]
- add ecx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc5
-
- emms
- }
- break;
-
- case 6:
- // d[i] = ((d[i]<<1) + s[i]+s[i+1] + 3) >> 2;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0003]
- mov eax, [s]
- mov ebx, [d]
- mov esi, 0x00
- mov edi, [h]
- mc6:
- movq mm1, [eax+esi]
- movq mm2, [eax+esi+1]
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- movq mm5, [ebx+esi]
-
- paddsw mm1, mm7
- paddsw mm3, mm7
-
- movq mm6, mm5
- punpcklbw mm5, mm0
- punpckhbw mm6, mm0
-
- psllw mm5, 1
- psllw mm6, 1
-
- paddsw mm1, mm5
- paddsw mm3, mm6
-
- psrlw mm1, 2
- psrlw mm3, 2
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc6
-
- add eax, [lx2]
- add ebx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc6
-
- emms
- }
- break;
-
- default:
- // d[i] = ((d[i]<<2) + s[i]+s[i+1]+s[i+lx]+s[i+lx+1] + 6)>>3;
- __asm
- {
- pxor mm0, mm0
- movq mm7, [mmmask_0006]
- mov eax, [s]
- mov ebx, [d]
- mov ecx, eax
- add ecx, [lx]
- mov esi, 0x00
- mov edi, [h]
- mc7:
- movq mm1, [eax+esi]
- movq mm2, [eax+esi+1]
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- paddsw mm1, mm2
- paddsw mm3, mm4
-
- movq mm5, [ecx+esi]
- paddsw mm1, mm7
-
- movq mm6, [ecx+esi+1]
- paddsw mm3, mm7
-
- movq mm2, mm5
- movq mm4, mm6
-
- punpcklbw mm2, mm0
- punpckhbw mm5, mm0
-
- punpcklbw mm4, mm0
- punpckhbw mm6, mm0
-
- paddsw mm2, mm4
- paddsw mm5, mm6
-
- paddsw mm1, mm2
- paddsw mm3, mm5
-
- movq mm6, [ebx+esi]
-
- movq mm4, mm6
- punpcklbw mm4, mm0
- punpckhbw mm6, mm0
-
- psllw mm4, 2
- psllw mm6, 2
-
- paddsw mm1, mm4
- paddsw mm3, mm6
-
- psrlw mm1, 3
- psrlw mm3, 3
-
- packuswb mm1, mm0
- packuswb mm3, mm0
-
- psllq mm3, 32
- por mm1, mm3
-
- add esi, 0x08
- cmp esi, [w]
- movq [ebx+esi-8], mm1
- jl mc7
-
- add eax, [lx2]
- add ebx, [lx2]
- add ecx, [lx2]
- sub edi, 0x01
- mov esi, 0x00
- cmp edi, 0x00
- jg mc7
-
- emms
- }
- break;
- }
-
- #else
-
- static int xint; /* horizontal integer sample vector: analogous to int_vec[0] */
- static int yint; /* vertical integer sample vectors: analogous to int_vec[1] */
- // static int xh; /* horizontal half sample flag: analogous to half_flag[0] */
- // static int yh; /* vertical half sample flag: analogous to half_flag[1] */
- static int lx2_x2;
- /*int i, j, v;*/
- static unsigned char *s; /* source pointer: analogous to pel_ref[][] */
- static unsigned char *d; /* destination pointer: analogous to pel_pred[][] */
- // static unsigned long *p_lsrc; /* DWORD pointer to source */
- // static unsigned long *p_ldst; /* DWORD pointer to dest */
- static unsigned long *p_Alsrc; /* QWORD aligned version of p_lsrc */
- static unsigned long *p_Aldst; /* QWORD aligned version of p_ldst */
- static __int64 dSLLamt, dSRLamt; /* dst shift amount for alignment */
- static DWORD sSLLamt, sSRLamt; /* src shift amount for alignment */
- static DWORD sSLLamt_minus_8;
-
- // mmMask definitions moved to global scope, to enforce QWORD alignment
-
- lx2_x2 = (lx2<<1); /* two times lx2 */
-
- /* half pel scaling for integer vectors */
- xint = dx>>1;
- yint = dy>>1;
-
- /* derive half pel flags */
- // xh = dx & 1;
- // yh = dy & 1;
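- /* e.g. dx==7 gives xint==3 with IS_XHALF set: the prediction lies
-    midway between integer samples 3 and 4 */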
-
- /* compute the linear address of pel_ref[][] and pel_pred[][]
- based on cartesian/raster coordinates provided */
- s = src + lx*(y+yint) + x + xint;
- d = dst + lx*y + x;
-
- // p_lsrc = (unsigned long *) s;
- // p_ldst = (unsigned long *) d;
-
- p_Alsrc = (unsigned long*)((DWORD)s & 0xFFFFFFF8); // calculate QWORD aligned address
- p_Aldst = (unsigned long*)((DWORD)d & 0xFFFFFFF8); // calculate QWORD aligned address
-
- // calculate byte shift amounts
- sSRLamt = ((DWORD) s )- ((DWORD) p_Alsrc );
- dSRLamt = (__int64)(((DWORD) d )- ((DWORD) p_Aldst ));
- sSRLamt = sSRLamt << 3; // convert byte-shift amt to bit shift amount
- dSRLamt = dSRLamt << 3; // convert byte-shift amt to bit shift amount
- sSLLamt = 64 - sSRLamt;
- dSLLamt = 64 - dSRLamt;
- sSLLamt_minus_8 = sSLLamt - 8;
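-
- /* Every MMX path below synthesizes an unaligned 8-byte read from two
-    QWORD-aligned reads plus the shift amounts computed above. A scalar
-    sketch of the trick (hypothetical helper; assumes a little-endian
-    32-bit target, as the (DWORD) pointer casts already do): */
- #if 0
- static unsigned __int64 load_unaligned_qword(const unsigned char *p)
- {
-     const unsigned __int64 *a = (const unsigned __int64 *)((DWORD)p & 0xFFFFFFF8);
-     DWORD srl = (((DWORD)p) - ((DWORD)a)) << 3;  /* bit shift, 0..56 */
-     if (srl == 0)              /* a shift by 64 is undefined in C    */
-         return a[0];           /* (psllq handles it; C must branch)  */
-     return (a[0] >> srl) | (a[1] << (64 - srl));
- }
- #endif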
-
- if (average_flag)
- {
- if (!IS_XHALF && !IS_YHALF) /* no horizontal nor vertical half-pel */
- {
- // for (j=0; j<h; j++) /* yes average, fullX, fullY */
- // {
- // for (i=0; i<w; i++)
- // {
- // v = d[i]+s[i];
- // /*d[i] = (v+(v>=0?1:0))>>1;*/
- // d[i] = (v+1)>>1;
- // }
- //
- // s+= lx2;
- // d+= lx2;
- // }
- __asm
- { // yes average, fullX, fullY
- // each iteration processes and writes 8 pixels
- // there is some minor software pipelining to reduce the
- // store latency
- mov edx, [lx2] ; edx <= lx2
- mov ecx, 0x00 ; ecx <= 0
-
- movd mm5, dword ptr [sSRLamt ]; // mm5 = right-shift src alignment
-
- movd mm6, dword ptr [sSLLamt ]; // mm6 = left-shift src alignment
-
- movq mm7, qword ptr [mmMaskFEFE]; mm7 <= 0xFEFEFEFEFE...
-
- mov esi, dword ptr [p_Alsrc ] ;esi <= aligned p_lsrc[]
-
- mov edi, dword ptr [d ] ;edi <= p_ldst[0]
-
- mov ebx, dword ptr [p_Aldst] ; ebx <= aligned p_ldst[]
-
- //srl ebx, 3 ; ebx <= (w >> 3) (width / 8)
- mov eax, [h] ; eax <= h
- jmp lp_afxfy; // skip the first store
-
- lp_afxfy_st: // outer loop for( y =0; y < H; ++y )
- movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3
-
- // begin inner loop for ( ecx=0; ecx < ebx; ecx += 8 )
- lp_afxfy : // Average, FullX/FullY
- // the read data operation is fairly involved, to enforce QWORD
- // aligned reads
-
- movq mm0, qword ptr [ esi + ecx ] ;mm0 <= p_Alsrc[ 8..1 ]
- // movq mm3, mm0 ; mm3 <= Mask
- movq mm1, qword ptr [ esi + ecx +8] ;mm1 <= p_Alsrc[ 16..9 ]
- psrlq mm0, mm5; // align [8..1]
-
- movq mm2, qword ptr [ ebx + ecx ] ;mm2 <= p_Aldst[ 8..1 ]
- psllq mm1, mm6; // align [16..9]
-
- movq mm3, qword ptr [ ebx + ecx +8] ;mm3 <= p_Aldst[ 16..9 ]
- por mm0, mm1; // mm0 = desired pixels src[8..1]
-
- psrlq mm2, qword ptr [ dSRLamt ];
- movq mm1, mm0; // mm1 = copy of mm0, pixels src[8..1]
-
- psllq mm3, qword ptr [ dSLLamt ];
- pand mm0, mm7; // mm0 = MSB7 of src[8..1]
-
- por mm2, mm3; // mm2 = desired pixels dst[8..1]
- psrlw mm0, 1; // mm0 = src[8..1] / 2
-
- movq mm3, mm2; // mm3 = copy of mm2, pixels dst[8..1]
- pand mm2, mm7; // mm2 = MSB7 of dst[8..1]
-
- psrlw mm2, 1; // mm2 = dst[8..1] / 2
- por mm3, mm1; // mm3 = SRC[] OR DST[]
-
- pand mm3, qword ptr [mmMask0101]; // mm3 = "+1" rounding compensation
- paddusb mm2, mm0; // mm2 = (src+dst) / 2
-
- add ecx, 0x08;
- paddusb mm3, mm2; // mm3 = final result dst[8..1]
-
- cmp ecx, [w] ; // compare i <> width
- // instead of storing result here, causing a stall, the store
- // is moved to the beginning of the loop (past the branch-compare)
-
- // movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3
-
- jb lp_afxfy_st ; // if ( i < width ) loop
- sub eax, 0x01 ; j = j -1
-
- // since we moved the store *after* the branch, we need to duplicate
- // the store statement for each branch-possibility
- movq qword ptr [ edi + ecx - 8], mm3 ;p_ldst[ i ] <= mm3
- add esi, edx ;p_Alsrc += lx2
-
- add ebx, edx ;p_Aldst += lx2
- add edi, edx ;p_ldst += lx2
- mov ecx, 0x00000000 ; i = 0;
-
- cmp eax, 0 ; compare j <> 0
- jg lp_afxfy ; if ( j > 0 ) loop
- } // end __asm, end of yes average, fullX, fullY
- }
- else if (!IS_XHALF && IS_YHALF) /* no horizontal but vertical half-pel */
- {
- // this MMX routine reads 4 x 2 pixels per iteration
- // this MMX routine outputs 4 pixels per iteration.
- // all reads (source and dest) are DWORD aligned
- __asm
- { // yes average, fullX, halfY
-
- // since the function processes only 4 pixels per iteration, we must
- // convert sSRLamt and dSRLamt to DWORD alignment values
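- // (e.g. if s is 6 bytes past a QWORD boundary, sSRLamt is 48 bits;
- //  48 & 0x1F == 16, the bit shift within the containing DWORD)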
- mov edx, dword ptr [sSRLamt ]; // left-shift src alignment
- mov eax, 0x00; // eax = x
-
- mov esi, dword ptr [s] ;//esi = source s[] (row y)
- and edx, 0x1F; // convert edx -> right-shift src alignment (DWORD)
-
- mov ebx, dword ptr [dSRLamt ]; // left-shift dst alignment
- and esi, 0xFFFFFFFC ; // esi = DWORD aligned source
-
- mov ecx, esi; // ecx = copy of esi
- and ebx, 0x1F; // convert ebx -> right-shift dst alignment (DWORD)
-
- movd mm4, dword ptr [d] ;//edi = destination d[]
- movd mm5, edx; // mm5 = right-shift src alignment (DWORD)
-
- add ecx, [lx]; // ecx = source s[] (row y+1)
- movd mm6, ebx; // mm6 = right-shift dest alignment (DWORD)
-
- mov edx, [lx2]; // edx = stride [lx2]
- pxor mm0, mm0; // mm0 = 0x0000_0000
-
- mov ebx, [h]; // ebx = [height]
-
- movq mm7, qword ptr [mmMask0002]; // mm7 = 0x00020002_...
-
- jmp lp_afxhy; // skip over initial store
-
- lp_afxhya: // for ( y = h; y > 0; --y)
- movd dword ptr [ edi + eax - 4], mm3; // store dst[4..1]
-
- lp_afxhy : // for ( x = 0; x < w; x+= 4)
- // the read data operation is fairly involved, to enforce QWORD
- // aligned reads
-
- movd mm1, dword ptr [esi + eax]; // read src[4..1] (y)
- movd edi, mm4;
-
- punpckldq mm1, dword ptr [esi + eax + 4]; // mm1 = src[8..1] (y)
- and edi, 0xFFFFFFFC; // edi = DWORD aligned dst[4..1]
-
- movd mm2, dword ptr [ecx + eax]; // read src[4..1] (y+1)
- psrlq mm1, mm5; // mm1 = desired input src[4..1] (y)
-
- punpckldq mm2, dword ptr [ecx + eax + 4]; // mm2 = src[8..1] (y+1)
-
- movd mm3, dword ptr [ edi + eax ]; // mm3 = dst[4..1]
- psrlq mm2, mm5; // mm2 = desired input src[4..1] (y+1)
-
- punpckldq mm3, dword ptr [ edi + eax + 4]; // mm3 = dst[8..1]
-
- punpcklbw mm1, mm0; // mm1 = input src[_4 _3 _2 _1] (y)
-
- punpcklbw mm2, mm0; // mm2 = input src[_4 _3 _2 _1 ] (y+1)
-
- psrlq mm3, mm6; // mm3 = desired input dst[4..1]
- paddusw mm1, mm7; // mm1 = "+2" src[4..1] (rounding compensation)
-
- // we now have all the desired inputs, mm1, mm2, mm3
-
- punpcklbw mm3, mm0; // mm3 = dst[_4 _3 _2 _1] (16-bit values)
- paddusw mm2, mm1; // mm2 = (src+src) (row y, y+1)
-
- movd edi, mm4; // restore copy
- psllw mm3, 0x01; // mm1 = (dst[ _4 _3 _2 _1] << 1 )
-
- paddusw mm3, mm2; // mm2 = [ src(y)+src(y+1)+ 2*dst ]
- add eax, 0x04; // x += 4
-
- cmp eax, [w]; // compare ( x <> w )
- psrlw mm3, 0x02; // mm3 = (src+src+2*dst)/4 (desired output)
-
- packuswb mm3, mm3; // mm3 = ready for output
-
- // this store has been relocated to beginning of loop to avoid stall
- // movd dword ptr [ edi + eax -4], mm3; // store dst[4..1]
-
- jl lp_afxhya; // end for ( x =0 ... )
-
- // since we moved store after the branch, we must replicate the store
- // for all possible outcomes of branch
- movd dword ptr [ edi + eax -4], mm3; // store dst[4..1]
- sub ebx, 0x01; // --y
-
- add esi, edx; // p_lsrc += lx2; row(y)
- add edi, edx; // p_ldst += lx2;
-
- add ecx, edx; // p_lsrc += lx2; row(y+1)
- mov eax, 0x00; // x = 0
-
- movd mm4, edi; // move edi => mm4 (backup copy of dest[] pointer )
- cmp ebx, 0x00; // compare y <> 0
-
- jg lp_afxhy ; // end for ( y = 0 ... )
-
- } // end __asm, end of yes average, fullX, halfY
-
- /*
- for (j=0; j<h; j++) // yes average, fullX, halfY
- {
- for (i=0; i<w; i++)
- {
- v = (((unsigned int)d[i])<<1) + ((unsigned int)(s[i]+s[i+lx]));
- //d[i]=(v+(v>=0?1:0))>>1;
- d[i]=(v+2)>>2;
- }
-
- s+= lx2;
- d+= lx2;
- }
- */
- }
- else if (IS_XHALF && !IS_YHALF) /* horizontal but no vertical half-pel */
- {
- // this MMX loop generates 8 output-pixels per iteration.
- // all reads are qword aligned
- __asm
- { // yes average, halfX, fullY
- mov edx, dword ptr [p_Aldst ]; // QWORD aligned dest[]
- mov eax, 0x00; // eax = x
-
- mov esi, dword ptr [p_Alsrc ]; // QWORD aligned src[]
- // pxor mm2, mm2; // mm2 = 0x0000_0000
-
- mov edi, dword ptr [d] ;//edi = destination d[]
-
- mov ebx, [h]; // ebx = y
-
- movd mm6, dword ptr [sSLLamt ]; // mm6 = left-shift src alignment
-
- movd mm7, dword ptr [sSRLamt ]; // mm7 = right-shift src alignment
- jmp lp_ahxfy; // skip first store operation
-
- lp_ahxfya: // for ( y = h; y > 0; --y )
- movq qword ptr [ edi + eax-8], mm1;
- lp_ahxfy : // for ( x = 0; x < w; x+= 8)
- // the read data operation is fairly involved, to enforce QWORD
- // aligned reads
-
- movq mm0, qword ptr [ esi + eax ]; // mm0 = p_Alsrc[8..1]
- // pxor mm2, mm2; // mm2 = 0x0000_0000
-
- movq mm1, qword ptr [ esi + eax + 8]; // mm1 = p_Alsrc[16..9]
- psrlq mm0, mm7; // sSRL amount
-
- // movq mm3, mm1; // copy mm1 to mm3
- movq mm2, mm1; // copy mm1 to mm2
- psllq mm1, mm6; // sSLL amount
-
- movq mm4, qword ptr [ edx + eax]; // mm4 = p_Aldst[8..1]
- psrlq mm2, mm7; // shift right ( "9th pixel src[xxxxxx9]")
-
- movq mm5, qword ptr [ edx + eax + 8]; // mm5 = p_Aldst[16..9]
- psllw mm2, 8; // mm2 = "[ x_ x_ | x_ 9_ ]"
- // punpcklbw mm2, mm3; // mm2 = src[12_ 11_ 10_ 9_], desired input
-
- psrlq mm4, qword ptr [ dSRLamt ]; // dSRL amount
- por mm0, mm1; // mm0 = desired input src[8..1]
-
- psllq mm5, qword ptr [ dSLLamt ]; // dSLL amount
- movq mm1, mm0; // mm1 = copy of desired input src[8..1]
-
- por mm5, mm4; // desired input mm5 = dst[8..1]
- psrlw mm0, 8; // mm0 = src[ _8 _6 _4 _2] (even pixels)
-
- // we have read all the desired inputs, mm0, mm5, mm2 (and mm1)
-
- pand mm1, [mmMask00FF]; // mm1 = src[ _7 _5 _3 _1] (odd pixels)
- psllq mm2, 40; // mm2 = src [ _9 _ _ _ ]
-
- paddusw mm0, [mmMask0002]; // mm0 = "+2" round-up adjustment
- movq mm4, mm1; // mm4 = copy of [src 7 5 3 1] (odd)
-
- psrlq mm4, 16; // mm4 = [ src _ 7 5 3 ]
- movq mm3, mm5; // mm3 = copy of dest[8..1]
-
- psrlw mm5, 0x08; // mm5 = dst[ _8 _6 _4 _2]
- paddusw mm1, mm0; // mm1 = src[8..0] even+odd pixels
-
- pand mm3, [mmMask00FF]; // mm3 = dst[ _7 _5 _3 _1] (odd pixels)
- psllw mm5, 0x01; // mm5 = 2*dst[ _8 _6 _4 _2]
-
- por mm2, mm4; // mm4 = src [ _9 _7 _5 _3]
- psllw mm3, 0x01; // mm3 = 2*dst[ _7 _5 _3 _1]
-
- // pixel alignment should look like this
- // from source [98 87 76 65 54 43 32 21]
- // from dest-> [ 8 7 6 5 4 3 2 1]
- // EV ODD EV O E O E O
-
- paddusw mm1, mm3; // mm1 = (src+src+2*dst) [ odd pixels ]
- paddusw mm2, mm5; // mm2 = (src+src+2*dst) [ even pixels ]
-
- psrlw mm1, 2; // mm1 = (src+src+2*dst)/4 (__87 __65 __43 __21)
- paddusw mm2, mm0; // mm2 = (src+src) (__98 __76 __54 __32)
-
- psrlw mm2, 2; // mm2 = (src+src)/4 [ __98 __76 __54 __32 ]
- add eax, 0x08;
-
- cmp eax, [ w ]; // compare x <> w
- psllw mm2, 8; // mm2 = (src+src)/4 [ 98__ 76__ 54__ 32__ ]
-
- por mm1, mm2; // mm1 = (src+src)/4 [ 98 87 76 65 54 43 21 ]
- // this store has been relocated to beginning of loop to avoid stall
- // movq qword ptr [ edi + eax-8], mm1;
- jl lp_ahxfya ; // end for ( x =0 ... )
-
- add esi, [lx2]; // p_lsrc += lx2;
- sub ebx, 0x01; // --y
-
- // since we moved store after the branch, we must replicate the store
- // for all possible outcomes of branch
- movq qword ptr [ edi + eax-8], mm1;
-
- add edi, [lx2]; // p_ldst += lx2;
- mov eax, 0x00; // x = 0
-
- add edx, [lx2]; // p_Alsrc += lx2;
- cmp ebx, 0x00; // compare y <> 0
-
- jg lp_ahxfy ; // end for ( y = height ... )
-
- } // end __asm , end of yes-average, halfX, fullY
- /*
- for (j=0; j<h; j++) // yes average, halfX, fullY
- {
- for (i=0; i<w; i++)
- {
- //v = d[i] + ((unsigned int)(s[i]+s[i+1]+1)>>1);
- v = (((unsigned int)d[i])<<1) + ((unsigned int)(s[i]+s[i+1]));
- //d[i] = (v+(v>=0?1:0))>>1;
- d[i] = (v+2)>>2;
- }
-
- s+= lx2;
- d+= lx2;
- }
- */
- }
- else /* if (xh && yh) horizontal and vertical half-pel */
- {
- // this MMX loop generates 4 output-pixels per iteration.
- // 8 bytes are read from src[] and dst[] each iteration, but
- // the upper 3 bytes are discarded.
- __asm
- { // yes average, halfX, halfY
- mov esi, dword ptr [p_Alsrc] ;//esi = aligned source p_Alsrc[]
- mov eax, 0x00; // eax = x
-
- movd mm5, dword ptr [sSLLamt ]; // mm5 = left-shift src alignment
-
- movd mm6, dword ptr [sSRLamt ]; // mm6 = right-shift src alignment
-
- mov edi, dword ptr [d] ;//edi = destination d[]
- mov ebx, 0x00; // ebx = y
-
- mov edx, dword ptr [p_Aldst] ;//edx = aligned dest p_Aldst[]
- mov ecx, [lx]; //vertical stride (for row y+1)
- pxor mm7, mm7; // mm7 = 0x0000_0000
-
- // movq mm6, qword ptr [mmMask0004]; // mm6 = 0x00040004_...
- jmp lp_ahxhy; // skip first store operation
-
- lp_ahxhya: // for ( y = 0; y < h; ++y )
- movd dword ptr [ edi + eax - 4], mm2;
-
- lp_ahxhy : // for ( x = 0; x < w; x+= 4)
- movq mm0, qword ptr [ esi + eax ]; // 0a) mm0 = p_Alsrc[8..1] row(y)
-
- movq mm2, qword ptr [ esi + eax + 8]; // 0a) mm2 = p_Alsrc[16..9] row(y)
- psrlq mm0, mm6; // shift right mm0
-
- movq mm1, qword ptr [ edx + eax ]; // 2a) mm1 = p_Aldst[8..1]
- add esi, ecx; // esi = src[] row(y+1)
-
- movq mm3, qword ptr [ edx + eax + 8 ]; // 2a) mm3 = p_Aldst[16..9]
- psllq mm2, mm5; // shift left mm2
-
- psrlq mm1, [ dSRLamt ];
- por mm0, mm2; //0a) mm0 = src[8..1] row(y) desired input
-
- psllq mm3, [ dSLLamt ];
- movq mm4, mm0; // 1b) mm4 = copy of src[8..1] row(y)
-
- movq mm2, qword ptr [ esi + eax ]; // 3a) mm2 = src[8..1] row(y+1)
- por mm1, mm3; // mm1 = dst[8..1] desired input
-
- movq mm3, qword ptr [ esi + eax +8]; // 3a) mm3 = src[16..9] row(y+1)
- psrlq mm2, mm6;
-
- paddusw mm4, [mmMask0004];
- psllq mm3, mm5;
-
- por mm2, mm3; // mm2 = src[7..1] row(y+1) desired input
- punpcklbw mm4, mm7; // 1c) mm4 = src [4..1] row(y)
-
- // 1*) mm4 = src [4..1] row(y) "+04" (rounding comp)
- psrlq mm0, 0x08; // 1d) mm0 = src [ _8765432 ] row(y)
- movq mm3, mm2; // 3b) mm3 = copy of src[8..1] row(y+1)
-
- punpcklbw mm0, mm7; // 1e) mm0 = src [5..2] row(y)
- add eax, 0x04; // x += 4;
-
- punpcklbw mm1, mm7; // 2b) mm1 = dst [4..1] row(y)
- paddusw mm0, mm4; // 1f) mm0 = (src+src) [54 43 32 21] row(y)
- // mm0 = partial result
-
- psllw mm1, 0x02; // 2c) mm1 = 4*dst[4..1]
- sub esi, ecx; // restore esi = p_Alsrc[] row(y); it was advanced to row(y+1)
-
- paddusw mm0, mm1; // 2d) mm0 = (4*dst+src(y)), partial result
- punpcklbw mm3, mm7; // 3c) mm3 = src[4..1] row(y+1)
-
- psrlq mm2, 0x08; // 3d) mm2 = src[ _8765432 ] row(y+1)
- paddusw mm0, mm3; // mm0 = (4*dst+src(y)+(even)src(y+1)) partial result
-
- punpcklbw mm2, mm7; // 3e) mm2 = src[5..2] row(y+1)
- ;//slot
-
- paddusw mm2, mm0 // 5) mm2 = (4*dst+src(y)+src(y+1), almost final result
- ;//slot
-
- cmp eax, [w]
- psrlw mm2, 0x03; // 6) mm2 = [ 4...2 ] / 8 average
-
- packuswb mm2, mm2; // 6) mm0 = final result, ready to store
-
- // the store has been moved to AFTER the loop, to avoid stall
- // movd dword ptr [ edi + eax ], mm2;
-
- jl lp_ahxhya; // end for ( x = 0; x< w; x+=4 );
-
- // 1 copy of the store is here, because of our software pipelining
- add esi, [lx2];
- add ebx, 0x01; // ++y;
-
- movd dword ptr [ edi + eax - 4], mm2;
-
- add edi, [lx2];
- mov eax, 0x00; // x = 0;
-
- add edx, [lx2];
-
- cmp ebx, [h]
- jl lp_ahxhy; // end for ( y = 0; y < h; ++y )
-
- } // end __asm , end of yes average, halfX, halfy
-
- /*
- for (j=0; j<h; j++) // yes average, halfX, halfY
- {
- for (i=0; i<w; i++)
- {
- //v = d[i] + ((unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2);
- v = (((unsigned int)d[i])<<2) +
- ((unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]));
- //d[i] = (v+(v>=0?1:0))>>1;
- d[i] = (v+4)>>3;
- }
-
- s+= lx2;
- d+= lx2;
- }*/
-
- }
- }
- else
- { /* -------- NO AVERAGE -------- */
-
- if (!IS_XHALF && !IS_YHALF) /* no horizontal nor vertical half-pel */
- {
- // for (j=0; j<h; j++) /* no average, fullX, fullY */
- // {
- // for (i=0; i<(w>>2); ++i)
- // {
- // p_ldst[i] = p_lsrc[i];
- // }
- //
- // p_lsrc += ( lx2 >> 2 );
- // p_ldst += ( lx2 >> 2 );
- // }
-
- __asm
- { // no average, fullX, fullY
- // processes 8*2 pixels per iteration (two rows, 8-pixels per row)
- // all reads (source and dest) are QWORD aligned
-
- movd mm6, [sSRLamt]; // mm6 =right-shift amount to QW align source
- mov ecx, 0x00; // ecx = x
-
- mov edi, dword ptr [d ] ;//edi <= p_ldst[0]
-
- mov ebx, [lx2] ; // ebx <= lx2
-
- mov esi, dword ptr [p_Alsrc ] ;esi <= p_lsrc[0]
- mov edx, ebx ; //edx <= lx2
- //srl ebx, 3 ; ebx <= (w >> 3) (width / 8)
- mov eax, [h] ; //eax <= h
- add edx, edi ; //edx <= p_ldst[] row(y+1)
-
- movd mm7, [sSLLamt]; // mm7 =left-shift amount to QW align source
- add ebx, esi ; //ebx <= p_lsrc[] row(y+1)
-
- jmp lp_fxfy; // skip initial store
-
- lp_fxfy_st2:
- movq qword ptr [ edi + ecx -8], mm0 ;dst[ i ] (row y) <= mm0
-
- movq qword ptr [ edx + ecx -8], mm2 ;dst[ i ] (row y+1) <= mm2
-
- // begin inner loop for ( ecx=0; ecx < [w]; ecx += 8 )
- lp_fxfy : // no average, FullX/FullY
-
- movq mm0, qword ptr [ esi + ecx ] ;mm0 <= p_Alsrc[ i ] (row y)
-
- movq mm1, qword ptr [ esi + ecx + 8] ;mm1 <= p_Alsrc[ i+8 ] (row y)
- psrlq mm0, mm6; // right shift [7..0] to align
-
- movq mm2, qword ptr [ ebx + ecx ] ;mm2 <= p_Alsrc[i] (row y+1)
- psllq mm1, mm7; // left shift [15..8] to align
-
- movq mm3, qword ptr [ ebx + ecx + 8] ;mm3 <= p_Alsrc[i+8] (row y+1)
- psrlq mm2, mm6; // right shift [7..0] to align
-
- por mm0, mm1; // mm0 = desired src[] 8-pixels (row y)
- psllq mm3, mm7; // left shift [15..8] to align
-
- por mm2, mm3; // mm2 = desired src[] 8-pixels (row y+1)
- add ecx, 0x08;
-
- // software pipelining, store-operation has been moved AFTER branch.
-
- cmp ecx, [w] ; // compare i <> width
- jb lp_fxfy_st2 ; // if ( i < width ) loop
-
- // increment src/dst by two rows ( 2*lx2)
- movq qword ptr [ edi + ecx -8], mm0 ;p_ldst[ i ] (row y) <= mm0
-
- add esi, [lx2_x2] ;//p_Alsrc += 2*lx2 ( row y)
-
- movq qword ptr [ edx + ecx -8], mm2 ;p_ldst[ i ] (row y+1) <= mm2
-
- add ebx, [lx2_x2] ;//p_Alsrc += 2*lx2 ( row y+1)
- sub eax, 2 ;// j = j - 2
-
- add edi, [lx2_x2] ;//p_ldst += 2*lx2 ( row y)
- mov ecx, 0x00 ;// i = 0;
-
- add edx, [lx2_x2] ;//p_ldst += 2*lx2 ( row y+1)
- cmp eax, 0 ; compare j <> 0
-
- jg lp_fxfy ; if ( j > 0 ) loop
- } // end __asm , end of no average, fullX, fullY
- }
- else if (!IS_XHALF && IS_YHALF) /* no horizontal but vertical half-pel */
- {
- // this MMX routine produces 8 pixels per iteration.
- // all source reads are qword aligned
- __asm
- { // no average, fullX, halfY
- mov esi, dword ptr [p_Alsrc] ;//esi = aligned source s[] (row y)
- mov eax, 0x00; // eax = x
-
- movq mm7, qword ptr [mmMask0101]; // mm7 = 0x01010101_...
- mov ecx, esi; // ecx = copy of esi
-
- movq mm4, qword ptr [mmMaskFEFE]; // mm4 = 0xFEFEFEFE_...
- mov edi, dword ptr [d] ;//edi = destination d[]
- mov ebx, 0x00; // ebx = y
-
- add ecx, [lx]; // ecx = source s[] (row y+1)
-
- mov edx, [lx2]; // edx = stride [lx2]
-
- movd mm5, [sSRLamt]; // mm5=source right shift amount to align
-
- movd mm6, [sSLLamt]; // mm6=source left shift amount to align
-
- jmp lp_fxhy; // skip over initial store
-
- lp_fxhya: // for ( y = 0; y < h; ++y)
- movq qword ptr [ edi + eax - 8], mm3; // store dst[8..1]
- pxor mm2,mm2; // mm2 = 0x0000_0000
- lp_fxhy : // for ( x = 0; x < w; x+= 8)
- movq mm0, qword ptr [ esi + eax ]; // mm0 = src[8..1] (y)
-
- movq mm1, qword ptr [ esi + eax + 8 ]; // mm1 = src[16..9] (y)
- psrlq mm0, mm5; // align src[8..1] (y)
-
- movq mm2, qword ptr [ ecx + eax ]; // mm2 = src[8..1] (y+1)
- psllq mm1, mm6; // align src[16..9] (y)
-
- movq mm3, qword ptr [ ecx + eax +8]; // mm3 = src[16..9] (y+1)
- psrlq mm2, mm5; // align src[8..1] (y+1)
-
- psllq mm3, mm6; // align src[16..9] (y+1)
- por mm0, mm1; // mm0 = desired pixels, src[ 8..1] (y)
-
- movq mm1, mm0; // mm1 = copy of mm0, src[ 8..1] (y)
- por mm2, mm3; // mm2 = desired pixels, src[8..1] (y+1)
-
- movq mm3, mm2; // mm3 = copy of mm2, src[ 8..1] (y+1)
- por mm2, mm0; // mm2 = src(y) OR src(y+1)
-
- pand mm1, mm4; // mm1 = MSB7 of src[8..1] (y)
- pand mm2, mm7; // mm2 = "+1" src[8..1] (round adjustment)
-
- pand mm3, mm4; // mm3 = MSB7 of src[8..1] (y+1)
- psrlw mm1, 1; // mm1 = src[8..1] / 2 (y)
-
- psrlw mm3, 1; // mm3 = src[8..1] / 2 (y+1)
- paddusb mm2, mm1; // mm2 = partial result
-
- paddusb mm3, mm2; // mm3 = (src+src) src[8..1] final result
- add eax, 0x08; // x += 8
-
- cmp eax, [w]; // compare x <> w
-
- // this store has been relocated to beginning of loop to avoid stall
- // movd dword ptr [ edi + eax -8], mm2; // store dst[8..1]
-
- jl lp_fxhya; // end for ( x =0 ... )
-
- // since we moved store after the branch, we must replicate the store
- // for all possible outcomes of branch
- movq qword ptr [ edi + eax -8], mm3; // store dst[8..1]
- add ebx, 0x01; // ++y
-
- add esi, edx; // p_lsrc += lx2; row(y)
- pxor mm2,mm2; // mm2 = 0x0000_0000
-
- add edi, edx; // p_ldst += lx2;
- mov eax, 0x00; // x = 0
-
- add ecx, edx; // p_lsrc += lx2; row(y+1)
-
- cmp ebx, [ h ]; // compare y <> height
- jl lp_fxhy ; // end for ( y = 0 ... )
-
- } // end __asm , end of no average, fullX, halfY
-
- /*
- for (j=0; j<h; j++) // no average, fullX, halfY
- {
- for (i=0; i<w; i++)
- {
- d[i] = (unsigned int)(s[i]+s[i+lx]+1)>>1;
- }
-
- s+= lx2;
- d+= lx2;
- }
- */
- }
- else if (IS_XHALF && !IS_YHALF) /* horizontal but no vertical half-pel */
- {
-
- // this MMX loop generates 8 output-pixels per iteration.
- // all reads are QWORD aligned
- __asm
- { /* no average, halfX, fullY */
- mov esi, dword ptr [p_Alsrc ] ;//esi = aligned source s[]
- mov eax, 0x00; // eax = x
-
- movd mm5, [sSRLamt]; // mm5 = src shift-right amount...
-
- movd mm6, [sSLLamt]; // mm6 = src shift-left amount...
-
- movd mm4, [sSLLamt_minus_8]; // mm4 = (mm6 - 8)
-
- mov edi, dword ptr [d] ;//edi = destination d[]
-
- mov ebx, [h]; // ebx = y
-
- movq mm7, qword ptr [mmMask0101]; // mm7 = 0x01010101_...
- jmp lp_hxfy; // skip first store operation
-
- lp_hxfya: // for ( y = 0; y < h; ++y )
- movq qword ptr [ edi + eax-8], mm3;
-
- lp_hxfy : // for ( x = 0; x < w; x+= 8)
- movq mm0, qword ptr [ esi + eax ]; // mm0 = p_Alsrc[8..1]
-
- movq mm1, qword ptr [ esi + eax + 8]; // mm1 = p_Alsrc[16..9]
- psrlq mm0, mm5; // align src[8..1] (y)
-
- movq mm3, mm1; // copy mm3 = p_Alsrc[16..9]
- psllq mm1, mm6; // align src[16..] (y)
-
- movq mm2, mm0; // copy mm2 = src[ 8..1] (post shift)
- psllq mm3, mm4; // mm3 = src[16..9]
-
- psrlq mm2, 8; // mm2 = src[ _ 8..2] (top byte 00'd)
- por mm0, mm1; // mm0 = desired data src[8..1]
-
- por mm2, mm3; // mm2 = desired data src[9..2]
- movq mm1, mm0; // mm1 = copy of mm0, src[8..1]
-
- movq mm3, mm2; // mm3 = copy of mm2, src[9..2]
- por mm2, mm0; // mm2 = src[9..2] OR src[8..1]
-
- pand mm1, qword ptr [mmMaskFEFE]; // mm1 = MSB7 of src[8..1]
- pand mm2, mm7; // mm2 = "+1" round compensation
-
- pand mm3, qword ptr [mmMaskFEFE]; // mm3 = MSB7 of src[9..2]
- psrlw mm1, 1; // mm1 = 1/2 src[8..1]
-
- paddusb mm2, mm1; // mm2 = partial result
- psrlw mm3, 1; // mm3 = 1/2 src[9..2]
-
- paddusb mm3, mm2; // mm3 = final result (src+src) [98 87 76 65 54 43 32 21]
- add eax, 0x08;
-
- cmp eax, [ w ]; // compare x <> w
- jl lp_hxfya ; // end for ( x =0 ... )
-
- add esi, [lx2]; // p_lsrc += lx2;
- sub ebx, 0x01; // --y
-
- // since we moved store after the branch, we must replicate the store
- // for all possible outcomes of branch
- movq qword ptr [ edi + eax-8], mm3;
- mov eax, 0x00; // x = 0
-
- add edi, [lx2]; // p_ldst += lx2;
-
- cmp ebx, 0; // compare y <> 0
- jg lp_hxfy ; // end for ( y = 0 ... )
-
- } // end __asm
-
-
- /*
- for (j=0; j<h; j++) // no average, halfX, fullY
- {
- for (i=0; i<w; i++)
- {
- d[i] = (unsigned int)(s[i]+s[i+1]+1)>>1;
- }
-
- s+= lx2;
- d+= lx2;
- }
- */
- }
- else /* if (xh && yh) horizontal and vertical half-pel */
- {
- // this MMX loop generates 4 output-pixels per iteration.
- // 8 bytes are read from src[] and dst[] each iteration, but
- // the upper bytes are discarded.
- // the reads are QWORD aligned
- __asm
- { /* no average, halfX, halfy*/
- mov esi, dword ptr [p_Alsrc];//esi = aligned source row(y)
- mov eax, 0x00; // eax = x
-
- mov edi, dword ptr [d] ;//edi = destination d[]
- mov ebx, 0x00; // ebx = y
-
- mov edx, [lx2]; // edx = stride [lx2]
- mov ecx, esi; // copy of esi
-
- add ecx, [lx]; // ecx = source s[] row(y+1)
- pxor mm7, mm7; // mm7 = 0x0000_0000
-
- movd mm5, [sSRLamt]; // mm5 = src shift-right amount...
-
- movd mm6, [sSLLamt]; // mm6 = src shift-left amount...
-
- // movq mm6, qword ptr [mmMask0002]; // mm6 = 0x00020002_..
- jmp lp_hxhy; // skip first store operation
-
- lp_hxhya: // for ( y = 0; y < h; ++y )
- movd dword ptr [ edi + eax - 4], mm2;
-
- lp_hxhy : // for ( x = 0; x < w; x+= 4)
- movq mm0, qword ptr [ esi + eax ]; // 1a) mm0 = src[8..1] row(y)
-
- movq mm1, qword ptr [ esi + eax +8]; // 1a) mm1 = src[16..9] row(y)
- psrlq mm0, mm5; // align src[8..1] row(y)
-
- movq mm2, qword ptr [ ecx + eax ]; // 3a) mm2 = src[8..1] row(y+1)
- psllq mm1, mm6; // align src[16..9] row(y)
-
- movq mm3, qword ptr [ ecx + eax +8]; // 3a) mm3 = src[16..9] row(y+1)
- psrlq mm2, mm5; // align src[8..1] row(y+1)
-
- por mm0, mm1; // mm0 = desired pixels src[8..1] row(y)
- psllq mm3, mm6; // align src[16..9] row(y+1)
-
- por mm2, mm3; // mm2 = desired pixels src[8..1] row(y+1)
- movq mm4, mm0; // 1b) mm4 = copy of src[8..1] row(y)
-
- punpcklbw mm4, mm7; // 1c) mm4 = src [4..1] row(y)
- add eax, 0x04; // x += 4;
-
- paddusw mm4, [mmMask0002]; // 1*) mm4 = src [4..1] row(y) "+02" (rounding comp)
- psrlq mm0, 0x08; // 1d) mm0 = src [ _8765432 ] row(y)
-
- punpcklbw mm0, mm7; // 1e) mm0 = src [5..2] row(y)
- movq mm3, mm2; // 3b) mm3 = copy of src[8..1] row(y+1)
-
- punpcklbw mm3, mm7; // 3c) mm3 = src[4..1] row(y+1)
- paddusw mm0, mm4; // 1f) mm0 = (src+src) [54 43 32 21] row(y)
- // mm0 = partial result
-
- psrlq mm2, 0x08; // 3d) mm2 = src[ _8765432 ] row(y+1)
- paddusw mm3, mm0; // partial result mm3 = src(y) + (even)src(y+1)
-
- punpcklbw mm2, mm7; // 3e) mm2 = src[5..2] row(y+1)
- ;//slot
-
- paddusw mm2, mm3 ;// 5) mm2 = (src(y)+src(y+1), partial result
- ;//slot
-
- psrlw mm2, 0x02; // mm2 = (src+src)/4 average, 16-bit, almost final
- ;//slot
-
- packuswb mm2, mm2; // 6) mm2 = 8-bit pix final result, ready to store
- cmp eax, [w]
-
- // the store has been moved to AFTER the loop, to avoid stall
- // movd dword ptr [ edi + eax ], mm2;
-
- jl lp_hxhya; // end for ( x = 0; x< w; x+=4 );
-
- // 1 copy of the store is here, because of our software pipelining
- movd dword ptr [ edi + eax - 4], mm2;
- add ebx, 0x01; // ++y;
-
- add esi, edx; // src(row y) += lx2
- mov eax, 0x00; // x = 0;
-
- add ecx, edx; // src(row y+1) += lx2
- add edi, edx; // dst(row y) += lx2
-
- cmp ebx, [h]
- jl lp_hxhy; // end for ( y = 0; y < h; ++y )
-
- } // end __asm
- /*
- for (j=0; j<h; j++) // no average, halfX, halfY
- {
- for (i=0; i<w; i++)
- {
- d[i] = (unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2;
- }
-
- s+= lx2;
- d+= lx2;
- }
- */
- }
- }
-
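- /* emms clears the MMX state so that floating-point code executed after
-    this function sees a valid x87 register stack */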
- __asm emms;
- #endif
- } // form_component_prediction()
-
-