diff --git a/src/smi.h b/src/smi.h
index 0f72db9..d2b54ed 100644
--- a/src/smi.h
+++ b/src/smi.h
@@ -256,6 +256,10 @@ typedef struct
     int			lcdWidth;	/* LCD width */
     int			lcdHeight;	/* LCD height */
 
+    /* FastXv */
+    Bool		FastXv;
+    int			FastXvLevel;
+
     /* XvExtension */
     int			videoKey;	/* Video chroma key */
     Bool		ByteSwap;	/* Byte swap for ZV port */
diff --git a/src/smi_driver.c b/src/smi_driver.c
index 9c10e46..1aa2624 100644
--- a/src/smi_driver.c
+++ b/src/smi_driver.c
@@ -173,6 +173,9 @@ typedef enum
     OPTION_PANEL_SIZE,
     OPTION_USE_FBDEV,
     OPTION_CSCVIDEO,
+    /* Add for fast yuv packed for Xv */
+    OPTION_FASTXV,
+    OPTION_FASTXVLEVEL,
     NUMBER_OF_OPTIONS
 } SMIOpts;
 
@@ -196,6 +199,8 @@ static const OptionInfoRec SMIOptions[] =
     { OPTION_PANEL_SIZE,     "PanelSize",	  OPTV_ANYSTR,	{0}, FALSE },
     { OPTION_USE_FBDEV,	     "UseFBDev",	  OPTV_BOOLEAN,	{0}, FALSE },
     { OPTION_CSCVIDEO,	     "CSCVideo",	  OPTV_BOOLEAN, {0}, TRUE },
+    { OPTION_FASTXV,	     "FastXv",        OPTV_BOOLEAN, {0}, TRUE },
+    { OPTION_FASTXVLEVEL,    "FastXvLevel",   OPTV_INTEGER, {0}, FALSE },
     { -1,		     NULL,		  OPTV_NONE,	{0}, FALSE }
 };
 
@@ -773,6 +778,33 @@ SMI_PreInit(ScrnInfoPtr pScrn, int flags)
 		   pSmi->CSCVideo ? "en" : "dis");
     }
 
+	if (xf86GetOptValBool (pSmi->Options, OPTION_FASTXV, &pSmi->FastXv))
+	{
+		xf86DrvMsg (pScrn->scrnIndex, X_CONFIG, "Option: FastXv %d.\n",
+					pSmi->FastXv);
+	}
+	else
+	{
+		pSmi->FastXv = TRUE;
+	}
+
+	if (xf86GetOptValInteger (pSmi->Options, OPTION_FASTXVLEVEL, &pSmi->FastXvLevel))
+	{
+		if (pSmi->FastXv == FALSE)
+		{
+			xf86DrvMsg (pScrn->scrnIndex, X_CONFIG, "Option: FastXvLevel omitted.\n");
+		}
+		else
+		{
+			xf86DrvMsg (pScrn->scrnIndex, X_CONFIG, "Option: FastXvLevel %d.\n",
+						pSmi->FastXvLevel);
+		}
+	}
+	else
+	{
+		pSmi->FastXvLevel = 1;
+	}
+
     SMI_MapMmio(pScrn);
     SMI_DetectMem(pScrn);
     SMI_MapMem(pScrn);
diff --git a/src/smi_video.c b/src/smi_video.c
index b4511b8..b0c02f2 100644
--- a/src/smi_video.c
+++ b/src/smi_video.c
@@ -53,6 +53,8 @@ authorization from the XFree86 Project and silicon Motion.
 #include "config.h"
 #endif
 
+#include <errno.h>
+
 #include "smi.h"
 #include "smi_video.h"
 
@@ -276,6 +276,7 @@
     XVIMAGE_YUY2,
     XVIMAGE_YV12,
     XVIMAGE_I420,
+    XVIMAGE_UYVY,
     {
 	FOURCC_RV15,			/* id				*/
 	XvRGB,				/* type				*/
@@ -499,6 +501,25 @@ static I2CByte SAA7111InitData[] =
 
 };
 
+/* FastXv */
+#define FASTXV_COUNT 20
+
+typedef struct jiffy_counts_t
+{
+	unsigned long long usr,nic,sys,idle,iowait,irq,softirq,steal;
+	unsigned long long total;
+	unsigned long long busy;
+} jiffy_counts_t;
+
+static struct jiffy_counts_t pre_jif={0}, jif={0};
+static int cpu_idle = 1;
+static int cpu_busy = 0;
+static int halfpack = 0;
+static int flag_save_stage = 0;
+static int count = 0;
+static int fast_level = 0;
+static int yuv_cache[125000] __attribute__((aligned(8)));
+
 
 /**************************************************************************/
 
@@ -897,6 +919,8 @@ SMI_SetupVideo(ScreenPtr pScreen)
     ptrAdaptor->PutImage = SMI_PutImage;
     ptrAdaptor->QueryImageAttributes = SMI_QueryImageAttributes;
 
+	fast_level = pSmi->FastXvLevel;
+
     smiPortPtr->Attribute[XV_COLORKEY] = pSmi->videoKey;
     smiPortPtr->Attribute[XV_INTERLACED] = pSmi->interlaced;
     smiPortPtr->videoStatus = 0;
@@ -1017,7 +1041,7 @@ SMI_PutVideo(
     BoxRec dstBox;
     INT32 x1, y1, x2, y2;
     int norm;
-    int size, width, height, fbPitch;
+    int size, width, height;
     int top, left;
     xf86CrtcConfigPtr crtcConf = XF86_CRTC_CONFIG_PTR(pScrn);
     xf86CrtcPtr crtc;
@@ -1141,8 +1165,6 @@ SMI_PutVideo(
     if (pSmi->ByteSwap)
 	cpr00 |= 0x00004000;
 
-    fbPitch = (pScrn->displayWidth * pSmi->Bpp + 15) & ~15;
-
     if (vid_w <= drw_w) {
 	xscale = (256 * vid_w / drw_w) & 0xFF;
     } else if (vid_w / 2 <= drw_w) {
@@ -1217,8 +1239,6 @@ SMI_PutVideo(
     DEBUG("xscale==%d yscale=%d width=%d height=%d\n",
 	  xscale, yscale, width, height);
 
-    /* aaa whats this                     ----------------------v ?
-    vid_address = (pPort->area->box.y1 * fbPitch) + ((y1 >> 16) * vid_pitch);*/
     vid_address = pPort->video_offset;
 
     DEBUG("test RegionsEqual\n");
@@ -1463,6 +1483,652 @@ SMI_QueryBestSize(
 }
 
 
+/* FastXv */
+static void get_jiffy_counts(struct jiffy_counts_t *jif)
+{
+	FILE *fp = NULL;
+
+	fp = fopen("/proc/stat", "r");
+	if (fscanf(fp, "cpu  %lld %lld %lld %lld %lld %lld %lld %lld",
+				&jif->usr, &jif->nic, &jif->sys, &jif->idle,
+				&jif->iowait, &jif->irq, &jif->softirq, &jif->steal) < 4)
+	{
+		xf86DrvMsg(0, X_INFO, " Error to scanf %p. %s\n",
+					fp, strerror(errno));
+	}
+	fclose(fp);
+	jif->total = jif->usr + jif->nic + jif->sys + jif->idle
+		+ jif->iowait + jif->irq + jif->softirq + jif->steal;
+	/* procps 2->x does not count iowait as busy time */
+	jif->busy = jif->total - jif->idle - jif->iowait;
+}
+
+typedef void (* fastxv)(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h);
+
+static void pack_loongson_MMX(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h)
+{
+	int i, j;
+	unsigned char *y, *u, *v;
+	int dstinc, yinc, uinc, vinc;
+
+	y = srcy;
+	u = srcu;
+	v = srcv;
+
+	dstinc = dstPitch - 2*w;
+	yinc = srcPitchy - w;
+	uinc = srcPitchuv - w/2;
+	vinc = srcPitchuv - w/2;
+
+	for (i = 0; i < h; i++)
+	{
+		asm (
+			".set arch=loongson2f\n\t"
+			".set noreorder\n\t"
+			"move $8, %8 \n\t"
+			"1: \n\t"
+			"beqz $8, 2f \n\t"
+			"xor $f0, $f0, $f0 \n\t"
+			"ldc1 $f4, (%0) \n\t"
+			"punpcklbh $f2, $f4, $f0 \n\t"
+			"punpckhbh $f4, $f4, $f0 \n\t"
+			"ldc1 $f16, 8(%0) \n\t"
+			"punpcklbh $f14, $f16, $f0 \n\t"
+			"punpckhbh $f16, $f16, $f0 \n\t"
+			
+			"lwc1 $f8, (%1) \n\t"
+			"lwc1 $f12, (%2) \n\t"
+			"punpcklbh $f8, $f8, $f12 \n\t"
+			"punpcklbh $f6, $f0, $f8 \n\t"
+			"punpckhbh $f8, $f0, $f8 \n\t"
+			"lwc1 $f18, 4(%1) \n\t"
+			"lwc1 $f12, 4(%2) \n\t"
+			"punpcklbh $f18, $f18, $f12 \n\t"
+			"punpcklbh $f10, $f0, $f18 \n\t"
+			"punpckhbh $f12, $f0, $f18 \n\t"
+
+			"or $f2, $f2, $f6 \n\t"
+			"or $f4, $f4, $f8 \n\t"
+			"or $f14, $f14, $f10 \n\t"
+			"or $f16, $f16, $f12 \n\t"
+
+			"sdc1 $f2, (%3) \n\t"
+			"sdc1 $f4, 8(%3) \n\t"
+			"add %0, 16 \n\t"
+			"add %1, 8 \n\t"
+			"add %2, 8 \n\t"
+			"sdc1 $f14, 0x10(%3) \n\t"
+			"sdc1 $f16, 0x18(%3) \n\t"
+			"add $8, -1 \n\t"
+			"b 1b \n\t"
+			"add %3, 32 \n\t"
+			"2: \n\t"
+			".set reorder\n\t"
+			: "=r" (y), "=r" (u), "=r" (v), "=r" (dst)
+			: "0" (y), "1" (u), "2" (v), "3" (dst), "r" (w>>4)
+			: "memory","$8"
+		);
+
+		asm (
+			".set arch=loongson2f\n\t"
+			".set noreorder\n\t"
+			"move $8, %8 \n\t"
+			"1: \n\t"
+			"beqz $8, 2f \n\t"
+			"xor $f0, $f0, $f0 \n\t"
+			"ldc1 $f4, (%0) \n\t"
+			"punpcklbh $f2, $f4, $f0 \n\t"
+			"punpckhbh $f4, $f4, $f0 \n\t"
+			
+			"lwc1 $f8, (%1) \n\t"
+			"lwc1 $f12, (%2) \n\t"
+			"punpcklbh $f8, $f8, $f12 \n\t"
+			"punpcklbh $f6, $f0, $f8 \n\t"
+			"punpckhbh $f8, $f0, $f8 \n\t"
+
+			"or $f2, $f2, $f6 \n\t"
+			"or $f4, $f2, $f8 \n\t"
+
+			"sdc1 $f2, (%3) \n\t"
+			"sdc1 $f4, 8(%3) \n\t"
+			"add %0, 8 \n\t"
+			"add %1, 4 \n\t"
+			"add %2, 4 \n\t"
+			"add $8, -1 \n\t"
+			"b 1b \n\t"
+			"add %3, 16 \n\t"
+			"2:\n\t"
+			".set reorder\n\t"
+			: "=r" (y), "=r" (u), "=r" (v), "=r" (dst)
+			: "0" (y), "1" (u), "2" (v), "3" (dst), "r" ((w&0xf)/8)
+			: "memory","$8"
+		);
+
+		for (j = (w&7)/2; j; j--)
+		{
+			*dst++ = *y++;
+			*dst++ = *u++;
+			*dst++ = *y++;
+			*dst++ = *v++;
+		}
+		y += yinc;
+		u = (i%2) ? (u + uinc): (u - w/2);
+		v = (i%2) ? (v + vinc): (v - w/2);
+		dst += dstinc;
+	}
+}
+
+static void halfpack_loongson_MMX(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h)
+{
+	int j;
+	unsigned char *y1, *y2, *u, *v;
+	int dstinc, yinc, uinc, vinc;
+
+	y1 = srcy;
+	y2 = srcy + srcPitchy;
+	u = srcu;
+	v = srcv;
+
+	dstinc = dstPitch - 2*w;
+	yinc = 2*srcPitchy - w;
+	uinc = srcPitchuv - w/2;
+	vinc = srcPitchuv - w/2;
+
+	for (h/=2; h; h--)
+	{
+		asm (
+			".set arch=loongson2f\n\t"
+			".set noreorder\n\t"
+			"move $8, %10 \n\t"
+			"1: \n\t"
+			"beqz $8, 2f \n\t"
+			"xor $f0, $f0, $f0 \n\t"
+			"ldc1 $f4, (%0) \n\t"
+			"ldc1 $f8, (%1) \n\t"
+			"pavgb $f4, $f4, $f8 \n\t"
+			"punpcklbh $f2, $f4, $f0 \n\t"
+			"punpckhbh $f4, $f4, $f0 \n\t"
+			"ldc1 $f16, 8(%0) \n\t"
+			"ldc1 $f14, 8(%1) \n\t"
+			"pavgb $f16, $f16, $f14 \n\t"
+			"punpcklbh $f14, $f16, $f0 \n\t"
+			"punpckhbh $f16, $f16, $f0 \n\t"
+			
+			"lwc1 $f8, (%2) \n\t"
+			"lwc1 $f12, (%3) \n\t"
+			"punpcklbh $f8, $f8, $f12 \n\t"
+			"punpcklbh $f6, $f0, $f8 \n\t"
+			"punpckhbh $f8, $f0, $f8 \n\t"
+			"lwc1 $f18, 4(%2) \n\t"
+			"lwc1 $f12, 4(%3) \n\t"
+			"punpcklbh $f18, $f18, $f12 \n\t"
+			"punpcklbh $f10, $f0, $f18 \n\t"
+			"punpckhbh $f12, $f0, $f18 \n\t"
+
+			"or $f2, $f2, $f6 \n\t"
+			"or $f4, $f4, $f8 \n\t"
+			"or $f14, $f14, $f10 \n\t"
+			"or $f16, $f16, $f12 \n\t"
+
+			"sdc1 $f2, (%4) \n\t"
+			"sdc1 $f4, 8(%4) \n\t"
+			"add %0, 16 \n\t"
+			"add %1, 16 \n\t"
+			"add %2, 8 \n\t"
+			"add %3, 8 \n\t"
+			"sdc1 $f14, 0x10(%4) \n\t"
+			"sdc1 $f16, 0x18(%4) \n\t"
+			"add $8, -1 \n\t"
+			"b 1b \n\t"
+			"add %4, 32 \n\t"
+			"2: \n\t"
+			".set reorder\n\t"
+			: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst)
+			: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "r" (w>>4)
+			: "memory","$8"
+		);
+
+		asm (
+			".set arch=loongson2f\n\t"
+			".set noreorder\n\t"
+			"move $8, %10 \n\t"
+			"1: \n\t"
+			"beqz $8, 2f \n\t"
+			"xor $f0, $f0, $f0 \n\t"
+			"ldc1 $f4, (%0) \n\t"
+			"ldc1 $f8, (%1) \n\t"
+			"pavgb $f4, $f4, $f8 \n\t"
+			"punpcklbh $f2, $f4, $f0 \n\t"
+			"punpckhbh $f4, $f4, $f0 \n\t"
+			
+			"lwc1 $f8, (%2) \n\t"
+			"lwc1 $f12, (%3) \n\t"
+			"punpcklbh $f8, $f8, $f12 \n\t"
+			"punpcklbh $f6, $f0, $f8 \n\t"
+			"punpckhbh $f8, $f0, $f8 \n\t"
+
+			"or $f2, $f2, $f6 \n\t"
+			"or $f4, $f4, $f8 \n\t"
+
+			"sdc1 $f2, (%4) \n\t"
+			"sdc1 $f4, 8(%4) \n\t"
+			"add %0, 8 \n\t"
+			"add %1, 8 \n\t"
+			"add %2, 4 \n\t"
+			"add %3, 4 \n\t"
+			"add $8, -1 \n\t"
+			"b 1b \n\t"
+			"add %4, 16 \n\t"
+			"2:\n\t"
+			".set reorder\n\t"
+			: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst)
+			: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "r" ((w&0xf)/8)
+			: "memory","$8"
+		);
+
+		for (j = (w&7)/2; j; j--)
+		{
+			*dst++ = (*y1++ + *y2++)/2;
+			*dst++ = *u++;
+			*dst++ = (*y1++ + *y2++)/2;
+			*dst++ = *v++;
+		}
+		y1 += yinc;
+		y2 += yinc;
+		u += uinc;
+		v += vinc;
+		dst += dstinc;
+	}
+}
+
+static void halfpack_loongson_MMX_Avg(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h)
+{
+	int j;
+	unsigned char *y1, *y2, *u, *v, *yuv;
+	int dstinc, yinc, uinc, vinc;
+
+	y1 = srcy;
+	y2 = srcy + srcPitchy;
+	u = srcu;
+	v = srcv;
+	yuv = (unsigned char *)(&yuv_cache);
+
+	dstinc = dstPitch - 2*w;
+	yinc = 2*srcPitchy - w;
+	uinc = srcPitchuv - w/2;
+	vinc = srcPitchuv - w/2;
+
+	if (flag_save_stage == 0)
+	{
+		for (h/=2; h; h--) {
+			asm (
+				".set arch=loongson2f\n\t"
+				".set noreorder\n\t"
+				"move $8, %12 \n\t"
+				"1: \n\t"
+				"beqz $8, 2f \n\t"
+				"xor $f0, $f0, $f0 \n\t"
+				"ldc1 $f4, (%0) \n\t"
+				"ldc1 $f8, (%1) \n\t"
+				"pavgb $f4, $f4, $f8 \n\t"
+				"punpcklbh $f2, $f4, $f0 \n\t"
+				"punpckhbh $f4, $f4, $f0 \n\t"
+				"ldc1 $f16, 8(%0) \n\t"
+				"ldc1 $f14, 8(%1) \n\t"
+				"pavgb $f16, $f16, $f14 \n\t"
+				"punpcklbh $f14, $f16, $f0 \n\t"
+				"punpckhbh $f16, $f16, $f0 \n\t"
+
+				"lwc1 $f8, (%2) \n\t"
+				"lwc1 $f12, (%3) \n\t"
+				"punpcklbh $f8, $f8, $f12 \n\t"
+				"punpcklbh $f6, $f0, $f8 \n\t"
+				"punpckhbh $f8, $f0, $f8 \n\t"
+				"lwc1 $f18, 4(%2) \n\t"
+				"lwc1 $f12, 4(%3) \n\t"
+				"punpcklbh $f18, $f18, $f12 \n\t"
+				"punpcklbh $f10, $f0, $f18 \n\t"
+				"punpckhbh $f12, $f0, $f18 \n\t"
+
+				"or $f2, $f2, $f6 \n\t"
+				"or $f4, $f4, $f8 \n\t"
+				"or $f14, $f14, $f10 \n\t"
+				"or $f16, $f16, $f12 \n\t"
+
+				"sdc1 $f2, (%4) \n\t"
+				"sdc1 $f4, 8(%4) \n\t"
+				"sdc1 $f2, (%5) \n\t"
+				"sdc1 $f4, 8(%5) \n\t"
+				"add %0, 16 \n\t"
+				"add %1, 16 \n\t"
+				"add %2, 8 \n\t"
+				"add %3, 8 \n\t"
+				"sdc1 $f14, 0x10(%4) \n\t"
+				"sdc1 $f16, 0x18(%4) \n\t"
+				"sdc1 $f14, 0x10(%5) \n\t"
+				"sdc1 $f16, 0x18(%5) \n\t"
+				"add $8, -1 \n\t"
+				"add %5, 32 \n\t"
+				"b 1b \n\t"
+				"add %4, 32 \n\t"
+				"2: \n\t"
+				".set reorder\n\t"
+				: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst), "=r"(yuv)
+				: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "5"(yuv), "r" (w>>4)
+				: "memory","$8"
+				);
+
+			asm (
+				".set arch=loongson2f\n\t"
+				".set noreorder\n\t"
+				"move $8, %12 \n\t"
+				"1: \n\t"
+				"beqz $8, 2f \n\t"
+				"xor $f0, $f0, $f0 \n\t"
+				"ldc1 $f4, (%0) \n\t"
+				"ldc1 $f8, (%1) \n\t"
+				"pavgb $f4, $f4, $f8 \n\t"
+				"punpcklbh $f2, $f4, $f0 \n\t"
+				"punpckhbh $f4, $f4, $f0 \n\t"
+
+				"lwc1 $f8, (%2) \n\t"
+				"lwc1 $f12, (%3) \n\t"
+				"punpcklbh $f8, $f8, $f12 \n\t"
+				"punpcklbh $f6, $f0, $f8 \n\t"
+				"punpckhbh $f8, $f0, $f8 \n\t"
+
+				"or $f2, $f2, $f6 \n\t"
+				"or $f4, $f2, $f8 \n\t"
+
+				"sdc1 $f2, (%4) \n\t"
+				"sdc1 $f4, 8(%4) \n\t"
+				"sdc1 $f2, (%5) \n\t"
+				"sdc1 $f4, 8(%5) \n\t"
+				"add %0, 8 \n\t"
+				"add %1, 8 \n\t"
+				"add %2, 4 \n\t"
+				"add %3, 4 \n\t"
+				"add $8, -1 \n\t"
+				"add %5, 16 \n\t"
+				"b 1b \n\t"
+				"add %4, 16 \n\t"
+				"2:\n\t"
+				".set reorder\n\t"
+				: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst), "=r"(yuv)
+				: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "5"(yuv), "r" ((w&0xf)/8)
+				: "memory","$8"
+				);
+
+			for (j = (w&7)/2; j; j--)
+			{
+				*dst++ = (*y1++ + *y2++)/2;
+				*dst++ = *u++;
+				*dst++ = (*y1++ + *y2++)/2;
+				*dst++ = *v++;
+			}
+			y1 += yinc;
+			y2 += yinc;
+			u += uinc;
+			v += vinc;
+			dst += dstinc;
+		}
+	}
+	else
+	{
+		for (h/=2; h; h--)
+		{
+			asm (
+				".set arch=loongson2f\n\t"
+				".set noreorder\n\t"
+				"move $8, %12 \n\t"
+				"1: \n\t"
+				"beqz $8, 2f \n\t"
+				"xor $f0, $f0, $f0 \n\t"
+				"ldc1 $f4, (%0) \n\t"
+				"ldc1 $f8, (%1) \n\t"
+				"pavgb $f4, $f4, $f8 \n\t"
+				"punpcklbh $f2, $f4, $f0 \n\t"
+				"punpckhbh $f4, $f4, $f0 \n\t"
+				"ldc1 $f16, 8(%0) \n\t"
+				"ldc1 $f14, 8(%1) \n\t"
+				"pavgb $f16, $f16, $f14 \n\t"
+				"punpcklbh $f14, $f16, $f0 \n\t"
+				"punpckhbh $f16, $f16, $f0 \n\t"
+
+				"lwc1 $f8, (%2) \n\t"
+				"lwc1 $f12, (%3) \n\t"
+				"punpcklbh $f8, $f8, $f12 \n\t"
+				"punpcklbh $f6, $f0, $f8 \n\t"
+				"punpckhbh $f8, $f0, $f8 \n\t"
+				"lwc1 $f18, 4(%2) \n\t"
+				"lwc1 $f12, 4(%3) \n\t"
+				"punpcklbh $f18, $f18, $f12 \n\t"
+				"punpcklbh $f10, $f0, $f18 \n\t"
+				"punpckhbh $f12, $f0, $f18 \n\t"
+
+				"ldc1 $f20, 0x0(%5) \n\t"
+				"ldc1 $f22, 0x8(%5) \n\t"
+				"ldc1 $f24, 0x10(%5) \n\t"
+				"ldc1 $f26, 0x18(%5) \n\t"
+
+				"or $f2, $f2, $f6 \n\t"
+				"or $f4, $f4, $f8 \n\t"
+				"or $f14, $f14, $f10 \n\t"
+				"or $f16, $f16, $f12 \n\t"
+
+				"pavgb $f2, $f2, $f20 \n\t"
+				"pavgb $f4, $f4, $f22 \n\t"
+				"sdc1 $f2, (%4) \n\t"
+				"sdc1 $f4, 8(%4) \n\t"
+				"sdc1 $f2, (%5) \n\t"
+				"sdc1 $f4, 8(%5) \n\t"
+				"add %0, 16 \n\t"
+				"add %1, 16 \n\t"
+				"add %2, 8 \n\t"
+				"add %3, 8 \n\t"
+				"pavgb $f14, $f14, $f24 \n\t"
+				"pavgb $f16, $f16, $f26 \n\t"
+				"sdc1 $f14, 0x10(%4) \n\t"
+				"sdc1 $f16, 0x18(%4) \n\t"
+				"sdc1 $f14, 0x10(%5) \n\t"
+				"sdc1 $f16, 0x18(%5) \n\t"
+				"add $8, -1 \n\t"
+				"add %5, 32 \n\t"
+				"b 1b \n\t"
+				"add %4, 32 \n\t"
+				"2: \n\t"
+				".set reorder\n\t"
+				: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst), "=r"(yuv)
+				: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "5"(yuv), "r" (w>>4)
+				: "memory","$8"
+				);
+
+			asm (
+				".set arch=loongson2f\n\t"
+				".set noreorder\n\t"
+				"move $8, %12 \n\t"
+				"1: \n\t"
+				"beqz $8, 2f \n\t"
+				"xor $f0, $f0, $f0 \n\t"
+				"ldc1 $f4, (%0) \n\t"
+				"ldc1 $f8, (%1) \n\t"
+				"pavgb $f4, $f4, $f8 \n\t"
+				"punpcklbh $f2, $f4, $f0 \n\t"
+				"punpckhbh $f4, $f4, $f0 \n\t"
+
+				"lwc1 $f8, (%2) \n\t"
+				"lwc1 $f12, (%3) \n\t"
+				"punpcklbh $f8, $f8, $f12 \n\t"
+				"punpcklbh $f6, $f0, $f8 \n\t"
+				"punpckhbh $f8, $f0, $f8 \n\t"
+
+				"ldc1 $f10, 0x0(%5) \n\t"
+				"ldc1 $f12, 0x8(%5) \n\t"
+
+				"or $f2, $f2, $f6 \n\t"
+				"or $f4, $f4, $f8 \n\t"
+
+				"pavgb $f2, $f2, $f10 \n\t"
+				"pavgb $f4, $f4, $f12 \n\t"
+				"sdc1 $f2, (%4) \n\t"
+				"sdc1 $f4, 8(%4) \n\t"
+				"sdc1 $f2, (%5) \n\t"
+				"sdc1 $f4, 8(%5) \n\t"
+				"add %0, 8 \n\t"
+				"add %1, 8 \n\t"
+				"add %2, 4 \n\t"
+				"add %3, 4 \n\t"
+				"add $8, -1 \n\t"
+				"add %5, 16 \n\t"
+				"b 1b \n\t"
+				"add %4, 16 \n\t"
+				"2:\n\t"
+				".set reorder\n\t"
+				: "=r" (y1), "=r" (y2), "=r" (u), "=r" (v), "=r" (dst), "=r"(yuv)
+				: "0" (y1), "1" (y2), "2" (u), "3" (v), "4" (dst), "5"(yuv), "r" ((w&0xf)/8)
+				: "memory","$8"
+				);
+
+			for (j = (w&7)/2; j; j--)
+			{
+				*dst++ = (*yuv++ + *y1++ + *y2++)/3;
+				*dst++ = (*yuv++ + *u++)/2;
+				*dst++ = (*yuv++ + *y1++ + *y2++)/3;
+				*dst++ = (*yuv++ + *v++)/2;
+			}
+			y1 += yinc;
+			y2 += yinc;
+			u += uinc;
+			v += vinc;
+			dst += dstinc;
+		}
+	}
+}
+
+static void do_avg(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h)
+{
+	if ((cpu_busy) && !(count % cpu_busy))
+	{
+		pack_loongson_MMX(dst, srcy, srcv, srcu,
+					dstPitch, srcPitchy, srcPitchuv, w, h);
+	}
+	else
+	{
+		if (cpu_busy)
+		{
+			return ;
+		}
+		else
+		{
+			pack_loongson_MMX(dst, srcy, srcv, srcu,
+						dstPitch, srcPitchy, srcPitchuv, w, h);
+		}
+	}
+
+	return;
+}
+
+static void do_halfpack_avg(unsigned char *dst, unsigned char *srcy, unsigned char *srcv, unsigned char *srcu, int dstPitch, int srcPitchy, int srcPitchuv, int w, int h)
+{
+	if ((cpu_busy) && !(count % cpu_busy))
+	{
+		halfpack_loongson_MMX(dst, srcy, srcv, srcu,
+					dstPitch, srcPitchy, srcPitchuv, w, h);
+	}
+	else
+	{
+		if (cpu_busy)
+		{
+			return ;
+		}
+		else
+		{
+			halfpack_loongson_MMX(dst, srcy, srcv, srcu,
+						dstPitch, srcPitchy, srcPitchuv, w, h);
+		}
+	}
+
+	return;
+}
+
+/*
+pack_loongson_MMX,			// MMX yuv planar to yuv packed
+halfpack_loongson_MMX_Avg,	// MMX yuv planar to yuv halfpacked without scale
+halfpack_loongson_MMX,		// Default. MMX yuv planar to yuv halfpacked with scale
+do_avg,						// Skip frames according to the cpu_busy value
+*/
+static fastxv fast_pack[3][3] =
+{
+	{pack_loongson_MMX, halfpack_loongson_MMX_Avg, do_avg}, /* cpu busy */
+	{pack_loongson_MMX, pack_loongson_MMX, pack_loongson_MMX}, /* cpu idle */
+	{halfpack_loongson_MMX, halfpack_loongson_MMX, do_halfpack_avg},
+};
+
+/*
+ *    1 |\/|3
+ *    2 |/\|4
+ *
+ *   + + + +  old     + + + +  new    + + + +  old    + + + +  new 
+ *   - - - -  new    - - - -  old      - - - -  new   - - - -  old 
+ *   + + + +  old     + + + +  new    + + + +  old    + + + +  new 
+ *   - - - -  new    - - - -  old      - - - -  new   - - - -  old
+ *   + + + +  old     + + + +  new    + + + +  old    + + + +  new
+ *   - - - -  new    - - - -  old      - - - -  new   - - - -  old
+ */
+static void xf86XVCopyYUV12ToPackedFast(const void *srcy, const void *srcv, const void *srcu, void *dst, int srcPitchy, int srcPitchuv, int dstPitch, int h, int w)
+{
+    unsigned char *y, *d;
+    static int step = 0;
+
+    y = (unsigned char*)srcy;
+	d = (unsigned char*)dst;
+
+    if (halfpack)
+	{
+	    fast_pack[2][2](d, y, (unsigned char*)srcv, (unsigned char*)srcu,
+					dstPitch, srcPitchy, srcPitchuv, w, 2*h);
+	    return;
+    }
+
+    if (!cpu_idle && fast_level)
+	{
+	    switch (step)
+		{
+	    case 0:
+		    step++;
+		    dstPitch *= 2;
+		    break;
+	    case 1:
+		    y += srcPitchy;
+		    d += dstPitch;
+		    dstPitch *= 2;
+		    step = (fast_level == 1) ? 0 : (step + 1);
+		    break;
+	    }
+    }
+
+    fast_pack[cpu_idle][fast_level](d, y,
+				(unsigned char*)srcv, (unsigned char*)srcu,
+				dstPitch, srcPitchy, srcPitchuv, w, h);
+}
+
+static void xf86XVCopyPackedFast(
+    const void *src,
+    void *dst,
+    int srcPitch,
+    int dstPitch,
+    int h,
+    int w)
+{
+    w <<= 1;
+    while (--h >= 0)
+	{
+		memcpy(dst, src, w);
+		src = (const CARD8 *)src + srcPitch;
+		dst = (CARD8 *)dst + dstPitch;
+    }
+}
+
 static int
 SMI_PutImage(
 	ScrnInfoPtr		pScrn,
@@ -1498,6 +2164,33 @@ SMI_PutImage(
 
     ENTER();
 
+	if (pSmi->FastXv && fast_level && count++ >= FASTXV_COUNT)
+	{
+		get_jiffy_counts(&jif);
+		if (jif.idle - pre_jif.idle > 0)
+		{
+			cpu_idle = 1;
+			cpu_busy = 0;
+		}
+		else
+		{
+			flag_save_stage = cpu_idle ? 0 : 1;
+			cpu_idle = 0;
+			cpu_busy++;
+		}
+		count = 0;
+		pre_jif = jif;
+	}
+	if ((src_w + src_h) > 1000)
+	{
+		halfpack = 1;
+		src_h /= 2;
+	}
+	else
+	{
+		halfpack = 0;
+	}
+
     x1 = src_x;
     y1 = src_y;
     x2 = src_x + src_w;
@@ -1593,10 +2286,20 @@ SMI_PutImage(
 		offset3 = tmp;
 	    }
 	    nLines = ((((y2 + 0xffff) >> 16) + 1) & ~1) - top;
-	    xf86XVCopyYUV12ToPacked(buf + (top * srcPitch) + (left >> 1), 
-				    buf + offset2, buf + offset3, dstStart,
-				    srcPitch, srcPitch2, dstPitch, nLines,
-				    nPixels);
+		if (pSmi->FastXv)
+		{
+			xf86XVCopyYUV12ToPackedFast(buf + (top * srcPitch) + (left >> 1), 
+						buf + offset2, buf + offset3, dstStart,
+						srcPitch, srcPitch2, dstPitch, nLines,
+						nPixels);
+		}
+		else
+		{
+			xf86XVCopyYUV12ToPacked(buf + (top * srcPitch) + (left >> 1), 
+						buf + offset2, buf + offset3, dstStart,
+						srcPitch, srcPitch2, dstPitch, nLines,
+						nPixels);
+		}
 	}
 	break;
     case FOURCC_UYVY:
@@ -1604,7 +2307,14 @@ SMI_PutImage(
     default:
 	buf += (top * srcPitch) + left;
 	nLines = ((y2 + 0xffff) >> 16) - top;
-	xf86XVCopyPacked(buf, dstStart, srcPitch, dstPitch, nLines, nPixels);
+	if (pSmi->FastXv)
+	{
+		xf86XVCopyPackedFast(buf, dstStart, srcPitch, dstPitch, nLines, nPixels);
+	}
+	else
+	{
+		xf86XVCopyPacked(buf, dstStart, srcPitch, dstPitch, nLines, nPixels);
+	}
         break;
     }
 
