
     thanks wgge7270@gmail.com  
     http://dev.lemote.com/cgit/Pixman.Loongson.git/


                     liushiwei@gmail.com  20121207

diff -uNra pixman-0.26.0.orig/configure.ac pixman-0.26.0/configure.ac
--- pixman-0.26.0.orig/configure.ac	2012-05-27 03:50:44.000000000 +0800
+++ pixman-0.26.0/configure.ac	2012-12-07 13:19:07.357995632 +0800
@@ -324,6 +324,56 @@
 AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
 
 dnl ===========================================================================
+dnl Check for Loongson SIMD
+dnl Reads the /proc/cpuinfo of the build machine, so a cross-compilation target is not detected.
+have_loongson_SIMD=no
+xserver_save_CFLAGS=$CFLAGS
+AC_MSG_CHECKING(for Loongson CPU in /proc/cpuinfo)
+dnl A V0.3 entry selects -march=loongson2f and a V0.2 entry selects -march=loongson2e.
+if test -n "`grep Loongson /proc/cpuinfo 2>/dev/null | grep V0.3 `"; then
+   LS_CFLAGS="-march=loongson2f"
+   CFLAGS="$LS_CFLAGS $CFLAGS"
+fi
+if test -n "`grep Loongson /proc/cpuinfo 2>/dev/null | grep V0.2 `"; then
+   LS_CFLAGS="-march=loongson2e"
+   CFLAGS="$LS_CFLAGS $CFLAGS"
+fi
+   AC_MSG_RESULT($LS_CFLAGS)
+
+AC_MSG_CHECKING(whether to use Loongson SIMD )
+dnl The probe below must compile: it rejects GCC older than 4.4 and needs the assembler to accept the asm.
+AC_COMPILE_IFELSE([
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 4.4 for Loongson SIMD compilation"
+#endif
+int main () {
+    /* Test with a loongson SIMD instruction. */
+	asm volatile ( "and \$f0, \$f0, \$f0 \n\t"  : : :  "memory" );
+    return 0;
+}], have_loongson_SIMD=yes)
+dnl NOTE(review): CFLAGS is not restored from xserver_save_CFLAGS anywhere in this section.
+
+AC_ARG_ENABLE(loongson,
+   [AC_HELP_STRING([--disable-loongson],
+                   [disable Loongson fast paths])],
+   [enable_loongson=$enableval], [enable_loongson=auto])
+
+if test $enable_loongson = no ; then
+   have_loongson_SIMD=disabled
+fi
+
+if test $have_loongson_SIMD = yes ; then
+   AC_DEFINE(USE_LOONGSON_SIMD, 1, [use Loongson SIMD])
+fi
+
+AC_MSG_RESULT($have_loongson_SIMD)
+if test $enable_loongson = yes && test $have_loongson_SIMD = no ; then
+   AC_MSG_ERROR([Loongson SIMD not detected])
+fi
+
+AM_CONDITIONAL(USE_LOONGSON_SIMD, test $have_loongson_SIMD = yes)
+
+dnl ===========================================================================
 dnl Check for MMX
 
 if test "x$MMX_CFLAGS" = "x" ; then
@@ -968,6 +1018,17 @@
 
 dnl ==================
 dnl libpng
+dnl NOTE(review): this check runs before the --enable-libpng option below is read - confirm it is intended.
+PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no)
+
+if test x$have_libpng = xyes; then
+    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
+fi
+
+AC_SUBST(HAVE_LIBPNG)
+
+dnl ==================
+dnl libpng
 
 AC_ARG_ENABLE(libpng, AS_HELP_STRING([--enable-libpng], [Build support for libpng (default: auto)]),
                       [have_libpng=$enableval], [have_libpng=auto])
diff -uNra pixman-0.26.0.orig/pixman/Makefile.am pixman-0.26.0/pixman/Makefile.am
--- pixman-0.26.0.orig/pixman/Makefile.am	2012-05-15 01:40:58.000000000 +0800
+++ pixman-0.26.0/pixman/Makefile.am	2012-12-07 13:19:07.361995974 +0800
@@ -21,6 +21,19 @@
 
 DISTCLEANFILES = $(BUILT_SOURCES)
 
+# Loongson SIMD fast paths (pixman-loongson-simd.c), built only under USE_LOONGSON_SIMD
+if USE_LOONGSON_SIMD
+noinst_LTLIBRARIES += libpixman-loongson-simd.la
+libpixman_loongson_simd_la_SOURCES = \
+	pixman-loongson-simd.c
+libpixman_loongson_simd_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
+libpixman_loongson_simd_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-loongson-simd.la
+# NOTE(review): ASM_CFLAGS_ls is not referenced elsewhere in this hunk - confirm it is needed.
+ASM_CFLAGS_ls=$(LS_CFLAGS)
+endif
+
 # mmx code
 if USE_X86_MMX
 noinst_LTLIBRARIES += libpixman-mmx.la
diff -uNra pixman-0.26.0.orig/pixman/pixman-cpu.c pixman-0.26.0/pixman/pixman-cpu.c
--- pixman-0.26.0.orig/pixman/pixman-cpu.c	2012-05-15 01:40:58.000000000 +0800
+++ pixman-0.26.0/pixman/pixman-cpu.c	2012-12-07 13:19:07.365996292 +0800
@@ -780,6 +780,10 @@
 	imp = _pixman_implementation_create_sse2 (imp);
 #endif
 
+#ifdef USE_LOONGSON_SIMD
+    if (!disabled ("loongson-simd")) /* same opt-out hook as the arm-simd path below */
+	imp = _pixman_implementation_create_ls (imp);
+#endif
 #ifdef USE_ARM_SIMD
     if (!disabled ("arm-simd") && pixman_have_arm_simd ())
 	imp = _pixman_implementation_create_arm_simd (imp);
diff -uNra pixman-0.26.0.orig/pixman/pixman-loongson-simd.c pixman-0.26.0/pixman/pixman-loongson-simd.c
--- pixman-0.26.0.orig/pixman/pixman-loongson-simd.c	1970-01-01 08:00:00.000000000 +0800
+++ pixman-0.26.0/pixman/pixman-loongson-simd.c	2012-12-07 13:19:07.401999435 +0800
@@ -0,0 +1,2696 @@
+/*
+ * Copyright 2011 WG Ge (wgge7270@gmail.com).
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Based on pixman-mmx.c
+ * Implemented for loongson 2E/2F only.
+ * Free software based on GPL v2 licence.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+/* Start of primitive macros */
+
+/* Loongson SIMD register usage protocol
+ *	return result: f8
+ *	tmp immediate: f12
+ *	tmp registers in primitive macros: f0, f2, and f14
+ *	tmp registers in combiner & compositor functions: f4, f6, f10, f16, and f18,
+ *	in which f4 and f6 are often used to hold preloaded constant values
+ *	load primitives specify destination FPR register
+ *	store primitives specify source FPR register
+ *	negate primitive specifies the src and dst registers
+ * Warning: Don't use return result $f8 as input, it might be overwritten
+ */
+/* Constants set by DEF_CONST(): $f20 = 0, $f22 = 0x00ff per halfword, $f24 = 0xff, $f26 = 0x0080 per halfword. */
+
+/* One clobber for all. Lazy but it actually fits almost all the functions defined */
+#define clobber "$8","$f0","$f2","$f4","$f6","$f8","$f10",\
+	"$f12","$f14","$f16","$f18","$f20","$f22","$f24","$f26","$f28"
+
+/* dli not recognized by march=loongson2f, use set mips3/set mips0 to work around */
+#define  DMTC1_IMM(regc1,imm) \
+	".set mips3 \n\t" \
+	"dli $8, "#imm" \n\t" \
+	".set mips0 \n\t" \
+	"dmtc1 $8, "#regc1" \n\t"
+/* MTC1_IMM: load an immediate with li into $8, then move it to an FPR (still via dmtc1). */
+#define  MTC1_IMM(regc1,imm) \
+	"li $8, "#imm" \n\t" \
+	"dmtc1 $8, "#regc1" \n\t"
+/* DEF_CONST: preload $f20/$f22/$f24/$f26; the macros below read them without reloading. */
+#define DEF_CONST() \
+	__asm__ volatile ( \
+	"xor $f20, $f20, $f20 \n\t" \
+	DMTC1_IMM($f22, 0x00ff00ff00ff00ff) \
+	MTC1_IMM($f24, 0xff) \
+	DMTC1_IMM($f26, 0x0080008000800080)\
+	:::clobber )
+/* save_to: copy the result register $f8 into reg1.  zero: clear reg1. */
+#define save_to(reg1)  "mov.d "#reg1" ,$f8 \n\t"
+#define zero(reg1)  "xor "#reg1","#reg1","#reg1" \n\t"
+/* load32: lwc1 a 32-bit word from memory operand sp into reg1. */
+#define load32(sp,reg1) \
+		"lwc1 "#reg1", "#sp" \n\t" 
+/* load32r: move general-purpose register sp into reg1 (dmtc1). */
+#define load32r(sp,reg1)  \
+	"dmtc1 "#sp", "#reg1" \n\t"
+/* load64: ldc1 a 64-bit doubleword from memory operand sp into reg1. */
+#define load64(sp,reg1) \
+		"ldc1 "#reg1", "#sp" \n\t" 
+/* store32: swc1 the low 32 bits of reg1 to memory operand sp. */
+#define store32(reg1,sp) \
+		"swc1 "#reg1", "#sp" \n\t"
+/* store32r: move reg1 into general-purpose register sp (dmfc1). */
+#define store32r(reg1,sp) \
+		"dmfc1 "#sp", "#reg1" \n\t"
+/* store64: sdc1 all 64 bits of reg1 to memory operand sp. */
+#define store64(reg1,sp) \
+		"sdc1 "#reg1", "#sp" \n\t"
+/* load8888: load one 32-bit pixel and widen its bytes to halfwords by interleaving with zero ($f20). */
+#define load8888(sp,reg1) \
+	load32(sp,reg1) \
+	"punpcklbh "#reg1", "#reg1", $f20 \n\t" 
+/* load8888r: as load8888, but the pixel comes from a general-purpose register. */
+#define load8888r(sp,reg1) \
+	load32r(sp,reg1) \
+	"punpcklbh "#reg1", "#reg1", $f20 \n\t" 
+/* store8888: pack the halfwords of reg1 to saturated bytes (modifies reg1), then store 32 bits. */
+#define store8888(reg1,sp) \
+	"packushb "#reg1", "#reg1", "#reg1" \n\t" \
+	store32(reg1,sp)
+/* store8888r: as store8888, but the pixel goes to a general-purpose register. */
+#define store8888r(reg1,sp) \
+	"packushb "#reg1", "#reg1", "#reg1" \n\t" \
+	store32r(reg1,sp)
+/* pack8888: $f8 = halfwords of reg1 and reg2 packed to unsigned-saturated bytes. */
+#define pack8888(reg1,reg2) 	\
+	"packushb $f8, "#reg1","#reg2" \n\t"
+/* unpack8888: $f8 = low bytes of reg1 interleaved with low bytes of reg2. */
+#define unpack8888(reg1,reg2) 	\
+	"punpcklbh $f8, "#reg1","#reg2" \n\t"
+/* negate: dreg = sreg ^ $f22, i.e. 255 - x for halfwords holding 0..255. */
+#define negate(sreg,dreg) \
+	"xor "#dreg", "#sreg", $f22 \n\t"
+/* pix_add: $f8 = reg1 + reg2, per byte with unsigned saturation. */
+#define pix_add(reg1,reg2) \
+	"paddusb $f8, "#reg1", "#reg2" \n\t"
+/* pix_multiply: per halfword t = reg1 * reg2 + 0x80; $f8 = (t + (t >> 8)) >> 8.  Uses $f12, $f14, $8. */
+#define pix_multiply(reg1,reg2) \
+	"pmullh $f14, "#reg1","#reg2" \n\t " \
+ 	MTC1_IMM($f12, 8) \
+	"paddush $f14, $f14, $f26 \n\t "\
+	"psrlh $f8, $f14, $f12\n\t" \
+	"paddush $f14, $f14, $f8 \n\t" \
+	"psrlh $f8, $f14, $f12 \n\t" \
+
+#define pix_add_mul(reg1,reg2,reg3,reg4) \
+	pix_multiply(reg1,reg2) \
+	"mov.d $f2, $f8 \n\t" \
+	pix_multiply(reg3,reg4) \
+	pix_add($f2,$f8)	
+/* expand_alpha: shuffle with selector $f24 (0xff), i.e. halfword 3 of sreg copied to every halfword of dreg. */
+#define expand_alpha(sreg,dreg) \
+		"pshufh "#dreg", "#sreg", $f24 \n\t"
+/* expand_alpha_rev: shuffle with selector $f20 (0), i.e. halfword 0 of sreg copied to every halfword of dreg. */
+#define expand_alpha_rev(sreg,dreg)\
+		"pshufh "#dreg", "#sreg", $f20 \n\t"
+/* expand8888: widen the low (pos 0) or high (pos 1) four bytes of reg1 into $f8. */
+#define expand8888(reg1,pos) expand8888_##pos(reg1)
+
+#define expand8888_0(reg1) \
+	"punpcklbh $f8, "#reg1", $f20 \n\t" 
+
+#define expand8888_1(reg1) \
+	"punpckhbh $f8, "#reg1", $f20 \n\t" 
+/* expandx888: expand8888, then OR 0x00ff into halfword 3 of $f8. */
+#define expandx888(reg1,pos) \
+	expand8888(reg1,pos) \
+	DMTC1_IMM($f12, 0x00ff000000000000) \
+	"or $f8, $f8, $f12 \n\t"
+/* invert_colors: $f8 = reg1 shuffled with selector 0b11000110 (halfwords 0 and 2 swapped). */
+#define invert_colors(reg1)  \
+	MTC1_IMM($f12, 0b11000110)\
+	"pshufh $f8, "#reg1", $f12 \n\t"
+/* over(s, a, d): $f8 = pix_add (s, pix_multiply (d, negate (a))). */
+#define over(reg1,reg2,reg3) \
+	negate(reg2,$f8) \
+	pix_multiply(reg3, $f8)\
+	pix_add(reg1, $f8) 
+
+/* over_rev_non_pre(s, d): swap halfwords 0/2 of s, scale by its alpha (halfword 3 scaled by 0xff), then over d. */
+#define over_rev_non_pre(reg1,reg2) \
+	expand_alpha(reg1,$f2) \
+	DMTC1_IMM($f12,0x00ff000000000000) \
+	"or $f0, $f2, $f12 \n\t" \
+	invert_colors(reg1) \
+	pix_multiply($f8,$f0) \
+	save_to($f0) \
+	over($f0, $f2, reg2)
+/* in(a, b): alias for pix_multiply. */
+#define in(reg1,reg2) pix_multiply(reg1,reg2) 
+/* in_over_full_src_alpha(s, m, d): OR 0x00ff into halfword 3 of s, multiply by m, then over (result, m, d). */
+#define in_over_full_src_alpha(reg1,reg2,reg3) \
+	DMTC1_IMM($f12,0x00ff000000000000) \
+	"or $f2, "#reg1", $f12 \n\t" \
+	in($f2,reg2) \
+	save_to($f2) \
+	over($f2,reg2,reg3)
+/* in_over(s, a, m, d): over (in (s, m), pix_multiply (a, m), d); uses $f0 and $f2 as temporaries. */
+#define in_over(reg1,reg2,reg3,reg4) \
+	in(reg1,reg3) \
+	"mov.d $f2, $f8 \n\t" \
+	pix_multiply(reg2,reg3) \
+	"mov.d $f0, $f8 \n\t" \
+	over($f2,$f0,reg4)
+
+/* End of primitive macros */
+
+#if 0
+pixman_bool_t
+pixman_fill_ls (uint32_t *bits,
+                 int       stride,
+                 int       bpp,
+                 int       x,
+                 int       y,
+                 int       width,
+                 int       height,
+                 uint32_t xor)
+{
+    uint64_t fill;
+    uint32_t byte_width;
+    uint8_t     *byte_line;
+    /* Fills a width x height rectangle at (x, y) with the bit pattern xor.
+     * Only 8, 16 and 32 bpp are handled; any other depth returns FALSE.
+     * stride arrives in uint32_t units and is converted to bytes below. */
+    if (bpp != 16 && bpp != 32 && bpp != 8)
+	return FALSE;
+
+    if (bpp == 8)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+        xor = (xor & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+        xor = (xor & 0xffff) * 0x00010001;
+    }
+    else
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+    /* Replicate the 32-bit pattern into both halves of a 64-bit word. */
+    fill = ((uint64_t)xor << 32) | xor;
+
+	__asm__ volatile (
+	"ldc1 $f24, %0 \n\t"
+	::"m"(fill):"$f24"
+	);
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+
+	byte_line += stride;
+	w = byte_width;
+	/* Byte, halfword and word stores until d is 8-byte aligned or w runs out. */
+	while (w >= 1 && ((unsigned long)d & 1))
+	{
+	    *(uint8_t *)d = (xor & 0xff);
+	    w--;
+	    d++;
+	}
+
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = xor;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 7))
+	{
+	    *(uint32_t *)d = xor;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	while (w >= 64)
+	{
+	/* 64 bytes per iteration; $f24 is assumed to still hold fill from the ldc1 above. */
+	__asm__ volatile (
+	"dmfc1 $8, $f24 \n\t"
+	"sd $8 ,   (%0) \n\t"
+	"sd $8 ,   8(%0) \n\t"	
+	"sd $8 ,   16(%0) \n\t"
+	"sd $8 ,   24(%0) \n\t"
+	"sd $8 ,   32(%0) \n\t"          
+	"sd $8 ,   40(%0) \n\t"	       
+	"sd $8 ,   48(%0) \n\t"        
+	"sd $8 ,   56(%0) \n\t"        
+	::"r"(d):"$8","memory","$f24"
+	);
+	    w -= 64;
+	    d += 64;
+	}
+	/* Remaining tail: words, then halfwords, then bytes. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = xor;
+
+	    w -= 4;
+	    d += 4;
+	}
+	while (w >= 2)
+	{
+	    *(uint16_t *)d = xor;
+	    w -= 2;
+	    d += 2;
+	}
+	while (w >= 1)
+	{
+	    *(uint8_t *)d = (xor & 0xff);
+	    w--;
+	    d++;
+	}
+
+    }
+    return TRUE;
+}
+
+static pixman_bool_t
+pixman_blt_ls (uint32_t *src_bits,
+                uint32_t *dst_bits,
+                int       src_stride,
+                int       dst_stride,
+                int       src_bpp,
+                int       dst_bpp,
+                int       src_x,
+                int       src_y,
+                int       dest_x,
+                int       dest_y,
+                int       width,
+                int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+    /* Copies a width x height rectangle; only equal depths of 16 or 32 bpp are handled, else FALSE. */
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    if (src_bpp == 16)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 2 * width;
+	src_stride *= 2;
+	dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 4 * width;
+	src_stride *= 4;
+	dst_stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+    /* Both strides are byte counts from here on. */
+    while (height--)
+    {
+	int w;
+	uint8_t *s = src_bytes;
+	uint8_t *d = dst_bytes;
+	src_bytes += src_stride;
+	dst_bytes += dst_stride;
+	w = byte_width;
+	/* Align d to 8 bytes with 16- and 32-bit copies; s may remain unaligned. */
+	while (w >= 2 && ((unsigned long)d & 3))
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((unsigned long)d & 7))
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+ if ((unsigned long)s & 7) /* source not 8-byte aligned: load with uld */
+{
+	while (w >= 64)
+	{
+
+	__asm__ volatile (
+	"uld $8 ,   (%1) \n\t"
+	"uld $9 ,  8(%1) \n\t"
+	"uld $10, 16(%1) \n\t"
+	"uld $11, 24(%1) \n\t"
+	"sd $8 ,   (%0) \n\t"
+	"sd $9 ,   8(%0) \n\t"	
+	"sd $10,   16(%0) \n\t"
+	"sd $11,   24(%0) \n\t"
+
+	"uld $8 ,   32(%1) \n\t"
+	"uld $9 ,   40(%1) \n\t"
+	"uld $10,   48(%1) \n\t"
+	"uld $11,   56(%1) \n\t"
+	"sd $8 ,   32(%0) \n\t"          
+	"sd $9 ,   40(%0) \n\t"	       
+	"sd $10,   48(%0) \n\t"        
+	"sd $11,   56(%0) \n\t"        
+	::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
+	);
+	    w -= 64;
+	    s += 64;
+	    d += 64;
+	}
+}
+else
+{
+	while (w >= 64)
+	{
+
+	__asm__ volatile (
+	"ld $8 ,   (%1) \n\t"
+	"ld $9 ,  8(%1) \n\t"
+	"ld $10, 16(%1) \n\t"
+	"ld $11, 24(%1) \n\t"
+	"sd $8 ,   (%0) \n\t"
+	"sd $9 ,   8(%0) \n\t"	
+	"sd $10,   16(%0) \n\t"
+	"sd $11,   24(%0) \n\t"
+
+	"ld $8 ,   32(%1) \n\t"
+	"ld $9 ,   40(%1) \n\t"
+	"ld $10,   48(%1) \n\t"
+	"ld $11,   56(%1) \n\t"
+	"sd $8 ,   32(%0) \n\t"          
+	"sd $9 ,   40(%0) \n\t"	       
+	"sd $10,   48(%0) \n\t"        
+	"sd $11,   56(%0) \n\t"        
+	::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
+	);
+	    w -= 64;
+	    s += 64;
+	    d += 64;
+	}
+}	
+	/* Tail: whole 32-bit words, then at most one 16-bit halfword. */
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+    }
+    return TRUE;
+}
+
+
+static pixman_bool_t
+ls_blt (pixman_implementation_t *imp,
+         uint32_t *               src_bits,
+         uint32_t *               dst_bits,
+         int                      src_stride,
+         int                      dst_stride,
+         int                      src_bpp,
+         int                      dst_bpp,
+         int                      src_x,
+         int                      src_y,
+         int                      dest_x,
+         int                      dest_y,
+         int                      width,
+         int                      height)
+{
+    /* Try the Loongson blit first.  Whatever it declines is handed,
+     * with the same arguments, to the delegate implementation. */
+    if (pixman_blt_ls (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+	return TRUE;
+
+    return _pixman_implementation_blt (
+	imp->delegate,
+	src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static pixman_bool_t
+ls_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t xor)
+{
+    /* Try the Loongson fill first.  Whatever it declines is handed,
+     * with the same arguments, to the delegate implementation. */
+    if (pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor))
+	return TRUE;
+
+    return _pixman_implementation_fill (
+	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static void
+ls_composite_copy_area (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         pixman_image_t *         src_image,
+                         pixman_image_t *         mask_image,
+                         pixman_image_t *         dest_image,
+                         int32_t                  src_x,
+                         int32_t                  src_y,
+                         int32_t                  mask_x,
+                         int32_t                  mask_y,
+                         int32_t                  dest_x,
+                         int32_t                  dest_y,
+                         int32_t                  width,
+                         int32_t                  height)
+{
+    /* Straight copy via pixman_blt_ls; its return value is ignored here. */
+    const int src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+    const int dst_bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format);
+
+    pixman_blt_ls (src_image->bits.bits, dest_image->bits.bits,
+                    src_image->bits.rowstride, dest_image->bits.rowstride,
+                    src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y, width, height);
+}
+#endif
+static void
+ls_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint32_t m;
+    uint32_t s;
+    int32_t w;
+    /* 32-bit source with alpha forced to 0xff, 8-bit mask, 32-bit dest; one pixel per asm statement. */
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m)
+	    {
+		s = *src | 0xff000000;
+		/* Full mask: s is opaque, so it simply replaces the dest pixel. */
+		if (m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+          __asm__ volatile (
+          load8888(%0,$f16) 
+          load8888(%1,$f18) 
+          load8888(%2,$f4) 
+          expand_alpha($f18,$f6)
+          expand_alpha_rev($f4,$f10)
+          in_over($f18,$f6,$f10,$f16)
+          store8888($f8,%0)
+          :"+m"(*dst):"m"(s),"m"(m):clobber
+          );
+
+		}
+	    }
+	    src++;
+	    dst++;
+	}
+    }
+}
+/* 32-bit source over 32-bit dest: opaque pixels are copied, zero pixels skipped, the rest go through over(). */
+static void
+ls_composite_over_8888_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    int dst_stride, src_stride;
+    uint32_t a;
+    int32_t w;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    a = *src >> 24;
+	    /* a is the top byte of the source pixel, used as its alpha. */
+	    if (a == 0xff)
+	    {
+		*dst = *src;
+	    }
+	    else if (*src)
+	    {
+		__asm__ volatile (
+		load8888(%1,$f4) 
+		load8888(%0,$f16)
+		expand_alpha($f4,$f6)
+		over($f4,$f6,$f16)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber
+		);
+	    }
+		dst++;
+		src++;	
+
+	}
+    }
+}
+
+/* 32-bit source over 32-bit dest, scaled by the top byte of a solid mask replicated into all four bytes. */
+static void
+ls_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid ( imp, mask_image, dest_image->bits.format);
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+	DEF_CONST();
+	__asm__ volatile (
+	load8888(%0,$f4)
+	::"m"(mask):clobber
+	);
+	/* $f4 keeps the expanded mask; the loop below reads it without reloading. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w)
+	{
+		__asm__ volatile (
+		load8888(%1,$f16) 
+		load8888(%0,$f18) 
+		expand_alpha($f16,$f10)
+		in_over($f16,$f10,$f4,$f18)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber
+		);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+}
+#if 0
+static void
+ls_composite_over_8888_n_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+    /* 32-bit source over 16-bit dest, scaled by a solid mask's top byte; dest is widened, blended, narrowed. */
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid ( imp, mask_image, dest_image->bits.format);
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+    /* Load $f20-$f26 only after the call above; a callee need not preserve them. */
+	DEF_CONST();
+	__asm__ volatile (
+	load8888(%0,$f4)
+	::"m"(mask):clobber
+	);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w)
+	{
+		d = CONVERT_0565_TO_0888 (*dst);
+
+		__asm__ volatile (
+		load8888(%1,$f16) 
+		load8888(%0,$f18) 
+		expand_alpha($f16,$f10)
+		in_over($f16,$f10,$f4,$f18)
+		store8888($f8,%0)
+		:"+m"(d):"m"(*src):clobber
+		);
+
+		*dst = CONVERT_8888_TO_0565 (d);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+}
+#endif
+static void
+ls_composite_over_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    /* Solid colour over a 32-bit dest; a zero colour leaves dest untouched. */
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+	/* Keep the solid colour in $f4 and its broadcast alpha in $f6 for the whole loop. */
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	while (w)
+	{
+	__asm__ volatile (
+	load8888(%0,$f10)
+	over($f4,$f6,$f10)
+	store8888($f8,%0)
+	:"+m"(*dst)::clobber
+	);
+
+	    w--;
+	    dst++;
+	}
+    }
+}
+/* Solid colour through a 32-bit mask (used per channel) over a 32-bit dest; zero mask pixels are skipped. */
+static void
+ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+	/* Keep the solid colour in $f4 and its broadcast alpha in $f6 for the whole loop. */
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint32_t *q = (uint32_t *)dst_line;
+
+	while (twidth)
+	{
+	    if (*p)
+	    {
+		__asm__ volatile (
+		load8888(%0,$f10)
+		load8888(%1,$f16)
+		in_over($f4,$f6,$f16,$f10)
+		store8888($f8,%0)
+		:"+m"(*q):"m"(*p):clobber
+		);
+	    }
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+    }
+}
+
+/* Solid colour through an 8-bit mask over a 32-bit dest; zero mask bytes are skipped. */
+static void
+ls_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+	/* Keep the solid colour in $f4 and its broadcast alpha in $f6 for the whole loop. */
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w)
+	{
+	    uint32_t m = *mask;
+	    /* m is widened to 32 bits so the asm can load it with lwc1. */
+	    if (m)
+	    {
+		__asm__ volatile (
+		load8888(%0,$f16)
+		load32(%1,$f18)
+		expand_alpha_rev($f18,$f10)
+		in_over($f4,$f6,$f10,$f16)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(m):clobber
+		);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+    }
+}
+/* 32-bit source with alpha forced to 0xff over a 32-bit dest, scaled by the top byte of a solid mask. */
+static void
+ls_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid ( imp, mask_image, dest_image->bits.format);
+    /* Load $f20-$f26 only after the call above; a callee need not preserve them. */
+	DEF_CONST();
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+	__asm__ volatile (
+	load8888(%0,$f4)
+	::"m"(mask):clobber
+	);
+	/* $f6 stands in for the source alpha: 0x00ff in every halfword. */
+	__asm__ volatile (
+	DMTC1_IMM($f6,0x00ff00ff00ff00ff)
+	:::clobber
+	);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w)
+	{
+		uint32_t src_tmp = *src | 0xff000000;
+		__asm__ volatile (
+		load8888(%1,$f16)
+		load8888(%0,$f18)		
+		in_over($f16,$f6,$f4,$f18)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(src_tmp):clobber
+		);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+}
+/* 32-bit source over 16-bit dest: zero pixels are skipped, opaque ones converted directly, the rest blended. */
+static void
+ls_composite_over_8888_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint32_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (s)
+	    {
+		if (a == 0xff)
+		{
+		    d = s;
+		}
+		else
+		{
+		    d = CONVERT_0565_TO_0888 (*dst);
+		    /* Blend in 32-bit form; d is narrowed back to 16 bits after the asm. */
+		    __asm__ volatile (
+		    load8888(%1,$f4) 
+		    load8888(%0,$f16)
+		    expand_alpha($f4,$f6) 
+		    over($f4,$f6,$f16)
+		    store8888($f8,%0)
+		    :"+m"(d):"m"(s):clobber
+		    );
+		}
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+/* Solid colour over a 16-bit dest; every pixel is widened, blended with over() and narrowed back. */
+static void
+ls_composite_over_n_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t d;
+    uint16_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+	/* $f4 (colour) and $f6 (its broadcast alpha) are read by every loop iteration below. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	while (w)
+	{
+		d = CONVERT_0565_TO_0888 (*dst);
+
+		__asm__ volatile (
+		load8888(%0,$f16)
+		over($f4,$f6,$f16)
+		store8888($f8,%0)
+		:"+m"(d)::clobber
+		);
+
+		*dst = CONVERT_8888_TO_0565 (d);
+	    w--;
+	    dst++;
+	}
+    }
+}
+/* Solid colour through an 8-bit mask over a 16-bit dest; zero mask bytes leave dest untouched. */
+static void
+ls_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, m, d;
+    uint16_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+	/* $f4 (colour) and $f6 (its broadcast alpha) are read by every loop iteration below. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w)
+	{
+	    m = *mask;
+	    d = *dst;
+
+	    if (m)
+	    {
+		d = CONVERT_0565_TO_0888 (d);
+
+		__asm__ volatile (
+		load8888(%0,$f16)
+		load32(%1,$f18)
+		expand_alpha_rev($f18,$f10)
+		in_over($f4,$f6,$f10,$f16)
+		store8888($f8,%0)
+		:"+m"(d):"m"(m):clobber
+		);
+
+		*dst = CONVERT_8888_TO_0565 (d);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+    }
+}
+/* Solid colour through a 32-bit mask (used per channel) over a 16-bit dest; zero mask pixels are skipped. */
+static void
+ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, m, d;
+    uint16_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+	__asm__ volatile (
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6)
+	::"m"(src):clobber
+	);
+	/* $f4 (colour) and $f6 (its broadcast alpha) are read by every loop iteration below. */
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint16_t *q = (uint16_t *)dst_line;
+
+	while (twidth)
+	{
+	    m = *(uint32_t *)p;
+	    d = *q;
+
+	    if (m)
+	    {
+		d = CONVERT_0565_TO_0888 (d);
+
+		__asm__ volatile (
+		load8888(%0,$f16)
+		load8888(%1,$f18)
+		in_over($f4,$f6,$f18,$f16)
+		store8888($f8,%0)
+		:"+m"(d):"m"(m):clobber
+		);
+
+		*q = CONVERT_8888_TO_0565 (d);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+    }
+}
+static void
+ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    /* Every pixel goes through over_rev_non_pre(): channels swapped and scaled by alpha, then over dest. */
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w)
+	{
+		__asm__ volatile (
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		over_rev_non_pre($f18,$f16)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber
+		);
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+}
+static void /* Per-pixel over_rev_non_pre of a 32-bit source onto a 16-bit destination; listed in ls_fast_paths for OVER pixbuf/rpixbuf. */
+ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, d; /* d is a 32-bit scratch copy of the destination pixel */
+    int dst_stride, src_stride;
+    int32_t w;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w)
+	{
+		d = CONVERT_0565_TO_0888 (*dst); /* widen the 16-bit pixel so the asm can work on it */
+
+		__asm__ volatile ( /* %0 = d (read-write), %1 = *src (read-only) */
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		over_rev_non_pre($f16,$f4)
+		store8888($f8,%0) /* presumably the result of over_rev_non_pre is in $f8 - TODO confirm */
+		:"+m"(d):"m"(*src):clobber
+		);
+
+		*dst = CONVERT_8888_TO_0565 (d); /* narrow back to 16 bits and write the pixel */
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+}
+
+static void /* Write (solid source combined with an 8-bit mask) into a 32-bit destination, overwriting it. */
+ls_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst, m; /* m holds the current mask byte widened to 32 bits */
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    if (src == 0) /* zero source: the whole rectangle is filled with 0 through imp->fill */
+    {
+	(*imp->fill) (imp, dest_image->bits.bits, dest_image->bits.rowstride,
+			 PIXMAN_FORMAT_BPP (dest_image->bits.format),
+	                 dest_x, dest_y, width, height, 0);
+	return;
+    }
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+	__asm__ volatile ( /* one-time setup from the solid colour */
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6) /* NOTE(review): no later asm in this function names $f6; dead unless `in` reads it implicitly - TODO confirm */
+	::"m"(src):clobber
+
+	);
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w)
+	{
+	    m = *mask;
+	    if (m)
+	    {
+		__asm__ volatile ( /* %0 = *dst (write-only: the old destination is not read), %1 = m */
+		load32(%1,$f16)
+		expand_alpha_rev($f16,$f10)
+		in($f4,$f10) /* relies on $f4 from the setup asm above - assumes it survives between asm statements, TODO confirm */
+		store8888($f8,%0)
+		:"=m"(*dst):"m"(m):clobber
+		);
+	    }
+	    else
+	    {
+		*dst = 0; /* zero mask: the destination pixel is cleared */
+	    }
+	    w--;
+	    mask++;
+	    dst++;
+	}
+    }
+}
+
+static void /* pix_add of a 32-bit source onto a 32-bit destination; listed in ls_fast_paths for ADD a8r8g8b8/a8b8g8r8. */
+ls_composite_add_8888_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+	DEF_CONST();
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (((unsigned long)dst & 7)||((unsigned long)src & 7)) ) /* head: single pixels until BOTH pointers are 8-byte aligned; if they never align together this loop consumes the whole row */
+	{
+		__asm__ volatile ( /* %0 = *dst (read-write), %1 = *src (read-only) */
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		pix_add($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber
+		);
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 2) /* body: two pixels per iteration through load64/store64 */
+	{
+
+		__asm__ volatile (
+		load64(%1,$f18) 
+		load64(%0,$f16) 
+		pix_add($f16,$f18)
+		store64($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber /* NOTE(review): operands are single uint32_t lvalues while the loop steps 2 pixels; presumably relies on `clobber` covering memory - TODO confirm */
+		);
+
+
+	    dst += 2;
+	    src += 2;
+	    w -= 2;
+	}
+
+	while (w) /* tail: at most one leftover pixel */
+	{
+		__asm__ volatile (
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		pix_add($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber
+		);
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+
+static void /* Saturating add of an 8-bit source onto an 8-bit destination: scalar head/tail, asm body 8 bytes at a time. */
+ls_composite_add_8_8 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t; /* 16-bit sum so the carry out of bit 7 is visible */
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (((unsigned long)dst & 7)||((unsigned long)src & 7)) ) /* head: scalar bytes until BOTH pointers are 8-byte aligned (or the row ends) */
+	{
+	    s = *src;
+	    d = *dst;
+	    t = d + s;
+	    s = t | (0 - (t >> 8)); /* if the sum overflowed 8 bits, (t >> 8) is 1 and the low byte is forced to 0xff */
+	    *dst = s;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 8) /* body: 8 bytes per iteration through load64/store64 */
+	{
+		__asm__ volatile ( /* %0 = *dst (read-write), %1 = *src (read-only) */
+		load64(%1,$f18) 
+		load64(%0,$f16) 
+		pix_add($f16,$f18) /* presumably a per-byte saturating add matching the scalar code - TODO confirm against the macro */
+		store64($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber /* NOTE(review): operands are single-byte lvalues while 8 bytes are stepped; presumably relies on `clobber` covering memory - TODO confirm */
+		);
+
+	    dst += 8;
+	    src += 8;
+	    w -= 8;
+	}
+
+	while (w) /* tail: fewer than 8 leftover bytes, same scalar saturating add as the head */
+	{
+	    s = *src;
+	    d = *dst;
+	    t = d + s;
+	    s = t | (0 - (t >> 8));
+	    *dst = s;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+}
+
+
+static void /* Add (solid source alpha combined with an 8-bit mask) onto an 8-bit destination. */
+ls_composite_add_n_8_8 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24; /* top byte of the solid colour; used by the scalar tail */
+
+    if (src == 0) /* nothing to add */
+	return;
+	DEF_CONST(); /* NOTE: not governed by the `if` above despite the indentation */
+	__asm__ volatile ( /* one-time setup from the solid colour */
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6) /* $f6 is named again by `in` in the row loop; assumes it survives between asm statements - TODO confirm */
+	::"m"(src):clobber
+	);
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	if ((((unsigned long)mask & 3) == 0) && /* the asm path is taken only when both row pointers are 4-byte aligned */
+	    (((unsigned long)dst  & 3) == 0))
+	{
+	    while (w >= 4) /* 4 bytes per iteration */
+	    {
+		__asm__ volatile ( /* %0 = *dst (read-write), %1 = *mask (read-only) */
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		in($f6,$f18)
+		pix_add($f16,$f8) /* feeds $f8 (presumably the result of `in`) straight into the add - TODO confirm */
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*mask):clobber /* NOTE(review): operands are single-byte lvalues while 4 bytes are stepped; presumably relies on `clobber` covering memory - TODO confirm */
+		);
+
+		w -= 4;
+		dst += 4;
+		mask += 4;
+	    }
+	}
+
+	while (w--) /* scalar path: leftover bytes, or the whole row when the pointers were not aligned */
+	{
+	    uint16_t tmp; /* scratch passed to the MUL_UN8/ADD_UN8 macros */
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp); /* source alpha combined with the mask byte */
+	    r = ADD_UN8 (m, d, tmp); /* added to the destination byte; presumably saturating - TODO confirm */
+
+	    *dst++ = r;
+	}
+    }
+
+}
+
+static void /* Combine an 8-bit source into an 8-bit destination via `in` (asm) or MUL_UN8 (scalar). */
+ls_composite_in_8_8 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+	DEF_CONST();
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	if ((((unsigned long)dst & 3) == 0) && /* the asm path is taken only when both row pointers are 4-byte aligned */
+	    (((unsigned long)src & 3) == 0))
+	{
+	    while (w >= 4) /* 4 bytes per iteration */
+	    {
+		__asm__ volatile ( /* %0 = *dst (read-write), %1 = *src (read-only) */
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		in($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*src):clobber /* NOTE(review): operands are single-byte lvalues while 4 bytes are stepped; presumably relies on `clobber` covering memory - TODO confirm */
+		);
+
+		w -= 4;
+		dst += 4;
+		src += 4;
+	    }
+	}
+
+	while (w--) /* scalar path: leftover bytes, or the whole row when the pointers were not aligned */
+	{
+	    uint8_t s, d;
+	    uint16_t tmp; /* scratch passed to MUL_UN8 */
+
+	    s = *src;
+	    d = *dst;
+
+	    *dst = MUL_UN8 (s, d, tmp);
+
+	    src++;
+	    dst++;
+	}
+    }
+
+}
+
+static void /* Combine (solid source alpha, 8-bit mask) into an 8-bit destination via two `in` steps (asm) or two MUL_UN8 (scalar). */
+ls_composite_in_n_8_8 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid ( imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24; /* top byte of the solid colour; used by the scalar tail */
+
+	DEF_CONST();
+	__asm__ volatile ( /* one-time setup from the solid colour */
+	load8888(%0,$f4)
+	expand_alpha($f4,$f6) /* $f6 is named again by `in` in the row loop; assumes it survives between asm statements - TODO confirm */
+	::"m"(src):clobber
+	);
+
+    while (height--) /* one iteration per scanline */
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	if ((((unsigned long)dst & 3) == 0) && /* the asm path is taken only when both row pointers are 4-byte aligned */
+	    (((unsigned long)mask & 3) == 0))
+	{
+	    while (w >= 4) /* 4 bytes per iteration */
+	    {
+		__asm__ volatile ( /* %0 = *dst (read-write), %1 = *mask (read-only) */
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		in($f6,$f18)
+		in($f8,$f16) /* second step consumes $f8 (presumably the result of the first `in`) - TODO confirm */
+		store8888($f8,%0)
+		:"+m"(*dst):"m"(*mask):clobber /* NOTE(review): operands are single-byte lvalues while 4 bytes are stepped; presumably relies on `clobber` covering memory - TODO confirm */
+		);
+
+		dst += 4;
+		mask += 4;
+		w -= 4;
+	    }
+	}
+
+	while (w--) /* scalar path: leftover bytes, or the whole row when the pointers were not aligned */
+	{
+	    uint16_t tmp; /* scratch passed to MUL_UN8 */
+	    uint8_t a;
+	    uint32_t m, d;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp); /* source alpha combined with the mask byte */
+	    d = MUL_UN8 (m, d, tmp); /* then combined with the destination byte */
+
+	    *dst++ = d;
+	}
+    }
+
+}
+
+static force_inline uint32_t /* Returns *src unchanged when mask is NULL; otherwise *src combined with *mask by the asm below. */
+combine (const uint32_t *src, const uint32_t *mask)
+{
+    uint32_t ssrc = *src;
+
+    if (mask) /* a NULL mask means "unmasked": fall through and return the raw source pixel */
+    {
+		DEF_CONST();
+		__asm__ volatile ( /* %0 = ssrc (read-write), %1 = *mask (read-only) */
+		load8888(%1,$f18) 
+		load8888(%0,$f16) 
+		expand_alpha($f18,$f18) /* presumably broadcasts the mask's alpha to every channel - TODO confirm */
+		pix_multiply($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(ssrc):"m"(*mask):clobber
+		);
+    }
+    return ssrc;
+}
+
+static void /* Per pixel: dest = pix_add(dest, src), with src first scaled by DIV_UN8(da, sa) when sa > da. */
+ls_combine_saturate_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+    while (dest < end)
+    {
+	uint32_t s = combine (src, mask); /* source pixel with the optional mask already applied */
+	uint32_t d = *dest;
+
+	__asm__ volatile ( /* loads only: $f18 <- s, $f16 <- d; both are named by the later asm statements */
+	load8888(%1,$f18) 
+	load8888(%0,$f16) 
+	:"+m"(d):"m"(s):clobber
+	);
+
+	uint32_t sa = s >> 24; /* top byte of the source */
+	uint32_t da = ~d >> 24; /* top byte of the complemented destination, i.e. 255 minus dest's top byte */
+
+	if (sa > da) /* source alpha exceeds the remaining room in the destination */
+	{
+		uint32_t dds =  DIV_UN8 (da, sa) << 24; /* scale factor placed in the top byte; sa > da >= 0 so sa is non-zero here */
+		__asm__ volatile (
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4)
+		pix_multiply($f18,$f4)
+		save_to($f18) /* presumably copies the multiply result back into $f18 - TODO confirm */
+		::"m"(dds):clobber
+		);
+	}
+	__asm__ volatile ( /* NOTE(review): assumes $f16/$f18 survived the C code between the asm statements - TODO confirm */
+	pix_add($f16,$f18) 
+	store8888($f8,%0)
+	:"=m"(*dest)::clobber
+	);
+
+	++src;
+	++dest;
+	if (mask) /* mask may be NULL; it is advanced only when present */
+	    mask++;
+    }
+}
+
+static void /* Per pixel: presumably dest = src * ~alpha(dest), with src pre-multiplied by alpha(mask) when mask is non-NULL - TODO confirm. */
+ls_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+
+    if (mask) 
+    {
+	    while (dest < end) /* masked variant */
+	    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4)
+		negate($f4,$f4)
+		pix_multiply($f16,$f4)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+
+		mask++;
+		++dest;
+		++src;
+    	    }
+	return; /* redundant with the else below, but harmless */
+    }else {
+
+	    while (dest < end) /* unmasked variant: same tail, source used as-is */
+	    {
+
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4)
+		negate($f4,$f4)
+		pix_multiply($f16,$f4)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);		
+		
+	++dest;
+	++src;
+	}
+    }
+}
+
+static void /* Per pixel: presumably dest = dest * ~alpha(src), with src pre-multiplied by alpha(mask) when mask is non-NULL - TODO confirm. */
+ls_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) /* masked variant */
+	{
+
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f16)
+		negate($f16,$f16)
+		pix_multiply($f16,$f4)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+
+		mask++;
+	++dest;
+	++src;
+
+    }
+	}else{ /* unmasked variant: same tail, source used as-is */
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f16)
+		negate($f16,$f16)
+		pix_multiply($f16,$f4)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);	
+
+	++dest;
+	++src;
+	}
+    }
+}
+
+static void /* Component-alpha variant; per pixel presumably dest = (src * mask) * ~alpha(dest) - TODO confirm against the macros. */
+ls_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f6)
+		negate($f6,$f6)
+		pix_multiply($f16,$f18)
+		save_to($f16) /* presumably moves the product back into $f16 - TODO confirm */
+		pix_multiply($f16,$f6)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Component-alpha variant; per pixel presumably dest = dest * ~(mask * alpha(src)) - TODO confirm against the macros. */
+ls_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f10)
+		pix_multiply($f18,$f10)
+		save_to($f18) /* presumably moves the product back into $f18 - TODO confirm */
+		negate($f18,$f18)
+		pix_multiply($f4,$f18)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Per pixel: presumably dest = src * alpha(dest) + dest * ~alpha(src), src masked first when mask is non-NULL - TODO confirm. */
+ls_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) /* masked variant */
+	{
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		expand_alpha($f4,$f10)
+		negate($f6,$f6) /* only the source-alpha term is negated here */
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+
+	++dest;
+	++src;
+		mask++;
+	}
+	}else { /* unmasked variant: same tail, source used as-is */
+
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		expand_alpha($f4,$f10)
+		negate($f6,$f6)
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);
+
+	++dest;
+	++src;
+     }
+    }
+}
+
+static void /* Per pixel: presumably dest = src * ~alpha(dest) + dest * alpha(src), src masked first when mask is non-NULL - TODO confirm. */
+ls_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end;
+	DEF_CONST();
+    end = dest + width;
+
+	if (mask){ /* masked variant */
+
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		expand_alpha($f4,$f10)
+		negate($f10,$f10) /* only the destination-alpha term is negated here */
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++dest;
+	++src;
+		mask++;
+	}
+	}else{ /* unmasked variant: same tail, source used as-is */
+
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		expand_alpha($f4,$f10)
+		negate($f10,$f10)
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);	
+
+	++dest;
+	++src;
+	}
+    }
+}
+
+static void /* Component-alpha variant; presumably dest = dest * ~(mask * alpha(src)) + (src * mask) * alpha(dest) - TODO confirm. */
+ls_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f6) 
+		expand_alpha($f16,$f10) 
+		pix_multiply($f16,$f18)
+		save_to($f16)
+		pix_multiply($f18,$f10)
+		save_to($f18)
+		negate($f18,$f18) /* only the (mask, source-alpha) term is negated here */
+		pix_add_mul($f4,$f18,$f16,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Component-alpha variant; presumably dest = dest * (mask * alpha(src)) + (src * mask) * ~alpha(dest) - TODO confirm. */
+ls_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f6)
+		expand_alpha($f16,$f10)
+		pix_multiply($f16,$f18)
+		save_to($f16) 
+		pix_multiply($f18,$f10)
+		save_to($f18) 
+		negate($f6,$f6) /* only the destination-alpha term is negated here */
+		pix_add_mul($f4,$f18,$f16,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Per pixel: presumably dest = src * ~alpha(dest) + dest * ~alpha(src), src masked first when mask is non-NULL - TODO confirm. */
+ls_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) 
+	{
+    while (dest < end) /* masked variant */
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6) 
+		expand_alpha($f4,$f10) 
+		negate($f6,$f6) /* both alpha terms are negated */
+		negate($f10,$f10)
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+		mask++;
+	++dest;
+	++src;
+      }
+	}else{ /* unmasked variant: same tail, source used as-is */
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6) 
+		expand_alpha($f4,$f10) 
+		negate($f6,$f6)
+		negate($f10,$f10)
+		pix_add_mul($f16,$f10,$f4,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);
+
+	++dest;
+	++src;
+    }
+    }
+}
+
+static void /* Component-alpha variant; presumably dest = dest * ~(mask * alpha(src)) + (src * mask) * ~alpha(dest) - TODO confirm. */
+ls_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f6) 
+		expand_alpha($f16,$f10) 
+		pix_multiply($f16,$f18)
+		save_to($f16) 
+		pix_multiply($f18,$f10)
+		save_to($f18) 
+		negate($f6,$f6) /* both weighting terms are negated */
+		negate($f18,$f18)
+		pix_add_mul($f4,$f18,$f16,$f6)		
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Per pixel: presumably dest = dest * alpha(src), with src pre-multiplied by alpha(mask) when mask is non-NULL - TODO confirm. */
+ls_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) 
+	{
+    while (dest < end) /* masked variant */
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		pix_multiply($f4,$f6)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+		mask++;
+	++dest;
+	++src;
+     }
+	} else { /* unmasked variant: same tail, source used as-is */
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f6)
+		pix_multiply($f4,$f6)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber		
+		);
+	
+	++dest;
+	++src;
+    }
+    }
+}
+
+static void /* Component-alpha variant; per pixel presumably dest = dest * (mask * alpha(src)) - TODO confirm against the macros. */
+ls_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f16,$f16) 
+		pix_multiply($f18,$f16)
+		save_to($f6) /* presumably copies the product into $f6 for the next multiply - TODO confirm */
+		pix_multiply($f4,$f6)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static void /* Per pixel: presumably dest = src * alpha(dest), with src pre-multiplied by alpha(mask) when mask is non-NULL - TODO confirm. */
+ls_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) 
+	{
+    while (dest < end) /* masked variant */
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4)
+		pix_multiply($f16,$f4)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+		mask++;
+	++dest;
+	++src;
+    }
+	} else { /* unmasked variant: same tail, source used as-is */
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4)
+		pix_multiply($f16,$f4)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);	
+	
+	++dest;
+	++src;
+    }
+   	}
+}
+
+static void /* Component-alpha variant; per pixel presumably dest = (src * mask) * alpha(dest) - TODO confirm against the macros. */
+ls_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		load8888(%0,$f4) 
+		expand_alpha($f4,$f4) 
+		pix_multiply($f16,$f18)
+		save_to($f6) /* presumably copies the product into $f6 for the next multiply - TODO confirm */
+		pix_multiply($f6,$f4)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+ }
+static void /* Component-alpha variant; per pixel dest is overwritten with pix_multiply(src, mask). */
+ls_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest, %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%2,$f18) 
+		load8888(%1,$f16) 
+		pix_multiply($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber /* *dest is declared read-write although the asm never loads %0 */
+		);
+	++src;
+	++mask;
+	++dest;
+    }
+
+}
+
+
+static void /* Per pixel: copies an opaque source, skips a zero source, otherwise blends it onto dest with the `over` macro. */
+ls_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+    while (dest < end)
+    {
+
+	uint32_t ssrc = combine (src, mask); /* source pixel with the optional mask already applied */
+	uint32_t a = ssrc >> 24; /* top byte of the (masked) source */
+
+	if (a == 0xff) /* fully opaque: the source simply replaces the destination */
+	{
+	    *dest = ssrc;
+	}
+	else if (ssrc) /* an all-zero source leaves the destination untouched */
+	{
+
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = ssrc (read-only) */
+		load8888(%1,$f16)
+
+		expand_alpha($f16,$f4) 
+		load8888(%0,$f6) 
+		over($f16,$f4,$f6)
+		store8888($f8,%0) /* presumably `over` leaves its result in $f8 - TODO confirm */
+		:"+m"(*dest):"m"(ssrc):clobber
+		);
+	}
+
+	++dest;
+	++src;
+	if (mask) /* mask may be NULL; it is advanced only when present */
+	    ++mask;
+    }
+}
+
+static void /* Per pixel: `over` with dest as the upper layer and the (optionally masked) source beneath it. */
+ls_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) /* masked variant */
+	{
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f6) 
+		expand_alpha($f6,$f10)
+		over($f6,$f10,$f16) /* argument order is (dest, alpha(dest), src): the roles are swapped relative to ls_combine_over_u */
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+		mask++;
+	++dest;
+	++src;
+    }
+	}else{ /* unmasked variant: same tail, source used as-is */
+
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f6) 
+		expand_alpha($f6,$f10)
+		over($f6,$f10,$f16)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);	
+	++dest;
+	++src;
+    }
+    }
+}
+
+
+static void /* Component-alpha variant; per pixel dest = in_over(src, alpha(src), mask, dest). */
+ls_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%0,$f16) 
+		load8888(%1,$f18) 
+		load8888(%2,$f4) 
+		expand_alpha($f18,$f6) 
+		in_over($f18,$f6,$f4,$f16)
+		store8888($f8,%0) /* presumably in_over leaves its result in $f8 - TODO confirm */
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+
+}
+
+static void /* Component-alpha variant; per pixel `over` with dest as the upper layer and in(src, mask) beneath it. */
+ls_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%0,$f16) 
+		load8888(%1,$f18) 
+		load8888(%2,$f4) 
+		in($f18,$f4)
+		save_to($f18) /* presumably moves the `in` result back into $f18 - TODO confirm */
+		expand_alpha($f16,$f10) 
+		over($f16,$f10,$f18)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+
+}
+
+static void /* Per pixel: dest = pix_add(src, dest), with src first multiplied by alpha(mask) when mask is non-NULL. */
+ls_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+	DEF_CONST();
+	if (mask) /* masked variant */
+	{
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask */
+		load8888(%2,$f18)  
+		load8888(%1,$f16) 
+		expand_alpha($f18,$f18)
+		pix_multiply($f16,$f18)
+		save_to ($f16) /* presumably moves the masked source back into $f16 - TODO confirm */
+
+		load8888(%0,$f18) 
+		pix_add($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+		mask++;
+	++dest;
+	++src;
+    }
+	}else{ /* unmasked variant: plain pix_add of source and destination */
+    while (dest < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src */
+		load8888(%1,$f16) 
+
+		load8888(%0,$f18) 
+		pix_add($f16,$f18)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src):clobber
+		);	
+	++dest;
+	++src;
+	}
+    }
+}
+
+static void /* Component-alpha variant; per pixel dest = pix_add(pix_multiply(src, mask), dest). */
+ls_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width; /* the loop is bounded by the source pointer */
+	DEF_CONST();
+    while (src < end)
+    {
+		__asm__ volatile ( /* %0 = *dest (read-write), %1 = *src, %2 = *mask; mask is dereferenced unconditionally */
+		load8888(%0,$f16) 
+		load8888(%1,$f18) 
+		load8888(%2,$f4) 
+		pix_multiply($f18,$f4)
+		save_to($f18) /* presumably moves the product back into $f18 - TODO confirm */
+		pix_add($f18,$f16)
+		store8888($f8,%0)
+		:"+m"(*dest):"m"(*src),"m"(*mask):clobber
+		);
+	++src;
+	++dest;
+	++mask;
+    }
+}
+
+static const pixman_fast_path_t ls_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, ls_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, ls_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, ls_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, ls_composite_over_x888_8_8888    ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   ls_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   ls_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   ls_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   ls_composite_over_pixbuf_0565    ),
+
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   ls_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   ls_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   ls_composite_over_n_0565         ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, ls_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, ls_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, ls_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, ls_composite_over_x888_n_8888    ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, ls_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, ls_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, ls_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, ls_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, ls_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, ls_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, ls_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, ls_composite_over_pixbuf_8888    ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, ls_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, ls_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, ls_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, ls_composite_over_8888_n_8888    ),
+#if 0
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    r5g6b5, ls_composite_over_8888_n_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    b5g6r5, ls_composite_over_8888_n_0565    ),
+#endif
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, ls_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, ls_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, ls_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, ls_composite_over_n_8_8888       ),
+
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, ls_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, ls_composite_over_n_8888         ),
+
+    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, ls_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, ls_composite_add_8888_8888       ),
+
+/* FIXME: memory copies are no better than the generic code */
+#if 0
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, ls_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, ls_composite_copy_area           ),
+#endif
+
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, ls_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, ls_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, ls_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, ls_composite_src_n_8_8888        ),
+
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       ls_composite_add_8_8		   ),
+    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       ls_composite_add_n_8_8           ),
+    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       ls_composite_in_8_8              ),
+    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       ls_composite_in_n_8_8            ),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ls (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, ls_fast_paths);
+    /* imp was created from fallback and ls_fast_paths; set the combine_32 entries to the ls_combine_*_u routines. */
+    imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u;
+    /* Likewise set the combine_32_ca entries to the ls_combine_*_ca routines (no SATURATE entry is set here). */
+    imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca;
+
+/* FIXME: blt and fill have not shown better performance than the generic code, so they stay disabled. */
+#if 0
+    imp->blt = ls_blt;
+    imp->fill = ls_fill;
+#endif
+
+    return imp;
+}
+
diff -uNra pixman-0.26.0.orig/pixman/pixman-private.h pixman-0.26.0/pixman/pixman-private.h
--- pixman-0.26.0.orig/pixman/pixman-private.h	2012-05-26 08:06:53.000000000 +0800
+++ pixman-0.26.0/pixman/pixman-private.h	2012-12-07 13:19:07.405999589 +0800
@@ -536,6 +536,11 @@
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
 
+#ifdef USE_LOONGSON_SIMD
+pixman_implementation_t *
+_pixman_implementation_create_ls (pixman_implementation_t *fallback);
+#endif /* USE_LOONGSON_SIMD */
+
 pixman_implementation_t *
 _pixman_implementation_create_noop (pixman_implementation_t *fallback);
 
diff -uNra pixman-0.26.0.orig/test/blitters-test.c pixman-0.26.0/test/blitters-test.c
--- pixman-0.26.0.orig/test/blitters-test.c	2012-05-15 01:40:58.000000000 +0800
+++ pixman-0.26.0/test/blitters-test.c	2012-12-07 13:19:07.409999879 +0800
@@ -67,9 +67,6 @@
 	pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
     }
 
-    if (lcg_rand_n (16) == 0)
-	pixman_image_set_filter (img, PIXMAN_FILTER_BILINEAR, NULL, 0);
-
     image_endian_swap (img);
 
     if (used_fmt) *used_fmt = fmt;
diff -uNra pixman-0.26.0.orig/test/lowlevel-blt-bench.c pixman-0.26.0/test/lowlevel-blt-bench.c
--- pixman-0.26.0.orig/test/lowlevel-blt-bench.c	2012-05-15 01:40:58.000000000 +0800
+++ pixman-0.26.0/test/lowlevel-blt-bench.c	2012-12-07 13:19:07.414000153 +0800
@@ -627,6 +627,7 @@
     { "over_n_8888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "over_n_0565",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "over_n_1555",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "over_8888_8888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "over_8888_0565",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
     { "over_8888_8888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
     { "over_8888_x888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
