--- /dev/null
+Subject: [PATCH] AVR32-optimized string operations
+
+Add hand-optimized AVR32-specific string operations. Some of them
+need a bit more testing, though.
+
+---
+
+ libc/string/avr32/Makefile | 40 +++++++++++
+ libc/string/avr32/bcopy.S | 15 ++++
+ libc/string/avr32/bzero.S | 12 +++
+ libc/string/avr32/memchr.S | 62 +++++++++++++++++
+ libc/string/avr32/memcmp.S | 50 +++++++++++++
+ libc/string/avr32/memcpy.S | 110 ++++++++++++++++++++++++++++++
+ libc/string/avr32/memmove.S | 114 +++++++++++++++++++++++++++++++
+ libc/string/avr32/memset.S | 60 ++++++++++++++++
+ libc/string/avr32/strcat.S | 95 ++++++++++++++++++++++++++
+ libc/string/avr32/strcmp.S | 80 ++++++++++++++++++++++
+ libc/string/avr32/strcpy.S | 63 +++++++++++++++++
+ libc/string/avr32/stringtest.c | 144 ++++++++++++++++++++++++++++++++++++++++
+ libc/string/avr32/strlen.S | 52 ++++++++++++++
+ libc/string/avr32/strncpy.S | 77 +++++++++++++++++++++
+ libc/string/avr32/test_memcpy.c | 66 ++++++++++++++++++
+ 15 files changed, 1040 insertions(+)
+
+Index: uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,15 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++ .text
++ .global bcopy
++ .type bcopy, @function
++ .align 1
++bcopy:
++ /* bcopy(src, dst, n) -> memmove(dst, src, n): XOR-swap r12 and r11 without a scratch register */
++ eor r11, r12
++ eor r12, r11
++ eor r11, r12
++ rjmp __memmove /* tail call; the length in r10 is passed through untouched */
++ .size bcopy, . - bcopy
+Index: uClibc-0.9.28-avr32/libc/string/avr32/bzero.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/bzero.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,12 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++ .text
++ .global bzero
++ .type bzero, @function
++ .align 1
++bzero:
++ mov r10, r11 /* second argument becomes the length register used by __memset */
++ mov r11, 0 /* fill value is zero */
++ rjmp __memset /* tail call; r12 (first argument) is passed through untouched */
+Index: uClibc-0.9.28-avr32/libc/string/avr32/Makefile
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/Makefile 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,40 @@
++# Makefile for uClibc
++#
++# Copyright (C) 2000-2003 Erik Andersen <andersen@uclibc.org>
++#
++# This program is free software; you can redistribute it and/or modify it under
++# the terms of the GNU Library General Public License as published by the Free
++# Software Foundation; either version 2 of the License, or (at your option) any
++# later version.
++#
++# This program is distributed in the hope that it will be useful, but WITHOUT
++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
++# details.
++#
++# You should have received a copy of the GNU Library General Public License
++# along with this program; if not, write to the Free Software Foundation, Inc.,
++# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++TOPDIR=../../../
++include $(TOPDIR)Rules.mak
++
++SSRC := bcopy.S bzero.S memcmp.S memcpy.S memmove.S
++SSRC += memset.S strcmp.S strlen.S
++# memchr.S, strcat.S, strcpy.S and strncpy.S are broken and left out of SSRC
++SOBJS := $(patsubst %.S,%.o, $(SSRC))
++OBJS := $(SOBJS)
++
++OBJ_LIST:= ../../obj.string.$(TARGET_ARCH)
++
++all: $(OBJ_LIST)
++
++$(OBJ_LIST): $(OBJS)
++ echo $(addprefix string/$(TARGET_ARCH)/, $(OBJS)) > $@
++
++$(SOBJS): %.o: %.S
++ $(CC) $(ASFLAGS) -c $< -o $@
++ $(STRIPTOOL) -x -R .note -R .comment $@
++
++clean:
++ $(RM) *.[oa] *~ core
+Index: uClibc-0.9.28-avr32/libc/string/avr32/memchr.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/memchr.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,62 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++#define str r12
++#define chr r11
++#define len r10
++
++ .text
++ .global memchr
++ .type memchr, @function
++ /*
++ * void *memchr(const void *str, int chr, size_t len): return a pointer to the
++ * first byte equal to (unsigned char)chr in the len bytes at str, or NULL.
++ */
++memchr:
++ /* Replicate the search byte into all four bytes of chr */
++ castu.b chr
++ or chr, chr, chr << 8
++ or chr, chr, chr << 16
++
++ /* Check single bytes until str is word-aligned */
++1: mov r9, str
++ andl r9, 3, COH
++ breq .Laligned_search
++ sub len, 1
++ retlt 0
++ ld.ub r8, str++
++ cp.b r8, chr
++ brne 1b
++ rjmp .Lfound
++
++.Laligned_search:
++ /* A byte of (word - chr) is zero exactly where the word matches chr */
++ sub len, 4
++ brlt 2f
++ ld.w r8, str++
++ psub.b r9, r8, chr
++ tnbz r9
++ brne .Laligned_search
++
++ /* The word just loaded holds a match: rescan it byte by byte */
++ sub str, 4
++3: ld.ub r8, str++
++ cp.b r8, chr
++ brne 3b
++ rjmp .Lfound
++
++ /* Fewer than four bytes left: check them one at a time */
++2: sub len, -4
++ reteq 0
++4: ld.ub r8, str++
++ cp.b r8, chr
++ breq .Lfound
++ sub len, 1
++ brne 4b
++ retal 0
++
++.Lfound:
++ /* str was post-incremented past the matching byte */
++ sub str, 1
++ retal str
+Index: uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S 2006-10-20 10:42:09.000000000 +0200
+@@ -0,0 +1,50 @@
++/*
++ * Copyright (C) 2004 Atmel Norway.
++ */
++
++#define s1 r12
++#define s2 r11
++#define len r10
++
++ .text
++ .global memcmp
++ .type memcmp, @function
++ .align 1
++memcmp:
++ sub len, 4 /* len is kept biased by -4 while whole words are compared */
++ brlt .Lless_than_4
++
++1: ld.w r8, s1++
++ ld.w r9, s2++
++ cp.w r8, r9
++ brne .Lfound_word
++ sub len, 4
++ brge 1b
++
++.Lless_than_4:
++ sub len, -4 /* undo the bias: len = 0..3 bytes left */
++ reteq 0
++
++1: ld.ub r8, s1++
++ ld.ub r9, s2++
++ sub r8, r9 /* both bytes are zero-extended, so the sign is meaningful */
++ retne r8
++ sub len, 1
++ brgt 1b
++
++ retal 0
++
++.Lfound_word:
++ /* r8 != r9: return the signed difference of the first differing byte pair, top byte first */
++2: bfextu r11, r9, 24, 8
++ bfextu r12, r8, 24, 8
++ sub r12, r11
++ retne r12
++ lsl r8, 8
++ lsl r9, 8
++ rjmp 2b /* terminates: the words differ, so some byte pair differs */
++
++ .size memcmp, . - memcmp
++
++ .weak bcmp
++ bcmp = memcmp
+Index: uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,110 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++/* Don't use r12 as dst since we must return it unmodified */
++#define dst r9
++#define src r11
++#define len r10
++
++ .text
++ .global memcpy
++ .type memcpy, @function
++
++ .global __memcpy
++ .hidden __memcpy
++ .type __memcpy, @function
++memcpy:
++__memcpy:
++ pref src[0]
++ mov dst, r12 /* work on a copy so r12 can be returned unmodified */
++
++ /* If we have less than 32 bytes, don't do anything fancy */
++ cp.w len, 32
++ brge .Lmore_than_31
++
++ sub len, 1
++ retlt r12 /* len was 0 */
++1: ld.ub r8, src++
++ st.b dst++, r8
++ sub len, 1
++ brge 1b
++ retal r12
++
++.Lmore_than_31:
++ pushm r0-r7, lr /* r0-r7 are used as the block-copy buffer below */
++
++ /* Check alignment */
++ mov r8, src
++ andl r8, 31, COH
++ brne .Lunaligned_src
++ mov r8, dst
++ andl r8, 3, COH
++ brne .Lunaligned_dst
++
++.Laligned_copy:
++ sub len, 32 /* len is kept biased by -32 inside the loop */
++ brlt .Lless_than_32
++
++1: /* Copy 32 bytes at a time */
++ ldm src, r0-r7
++ sub src, -32
++ stm dst, r0-r7
++ sub dst, -32
++ sub len, 32
++ brge 1b
++
++.Lless_than_32:
++ /* Copy 16 more bytes if possible */
++ sub len, -16
++ brlt .Lless_than_16
++ ldm src, r0-r3
++ sub src, -16
++ sub len, 16
++ stm dst, r0-r3
++ sub dst, -16
++
++.Lless_than_16:
++ /* Do the remaining as byte copies */
++ neg len /* len was -16..-1 (16 + len bytes left); now 1..16 */
++ add pc, pc, len << 2 /* skips len - 1 of the 15 pairs below; assumes each pair assembles to 4 bytes */
++ .rept 15
++ ld.ub r0, src++
++ st.b dst++, r0
++ .endr
++
++ popm r0-r7, pc /* return; r12 still holds the original destination */
++
++.Lunaligned_src:
++ /* Make src cacheline-aligned. r8 = (src & 31) */
++ rsub r8, r8, 32 /* r8 = 32 - (src & 31) bytes up to the next 32-byte boundary */
++ sub len, r8
++1: ld.ub r0, src++
++ st.b dst++, r0
++ sub r8, 1
++ brne 1b
++
++ /* If dst is word-aligned, we're ready to go */
++ pref src[0]
++ mov r8, 3
++ tst dst, r8
++ breq .Laligned_copy
++
++.Lunaligned_dst:
++ /* src is aligned, but dst is not. Expect bad performance */
++ sub len, 4
++ brlt 2f
++1: ld.w r0, src++
++ st.w dst++, r0
++ sub len, 4
++ brge 1b
++
++2: neg len /* len was -4..-1 (4 + len bytes left); now 1..4 */
++ add pc, pc, len << 2 /* skips len - 1 of the 3 pairs below */
++ .rept 3
++ ld.ub r0, src++
++ st.b dst++, r0
++ .endr
++
++ popm r0-r7, pc
++ .size memcpy, . - memcpy
+Index: uClibc-0.9.28-avr32/libc/string/avr32/memmove.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/memmove.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,114 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++#define dst r12
++#define src r11
++#define len r10
++
++ .text
++ .global memmove
++ .type memmove, @function
++
++ .global __memmove
++ .hidden __memmove
++ .type __memmove, @function
++memmove:
++__memmove:
++ cp.w src, dst
++ brge __memcpy /* src >= dst: a forward copy is used. NOTE(review): signed compare of addresses */
++
++ add dst, len /* dst is r12; every path below walks it back down by exactly len, restoring the return value */
++ add src, len
++ pref src[-1]
++
++ /*
++ * The rest is basically the same as in memcpy.S except that
++ * the direction is reversed.
++ */
++ cp.w len, 32
++ brge .Lmore_than_31
++
++ sub len, 1
++ retlt r12 /* len was 0, so r12 is unchanged */
++1: ld.ub r8, --src
++ st.b --dst, r8
++ sub len, 1
++ brge 1b
++ retal r12
++
++.Lmore_than_31:
++ pushm r0-r7, lr
++
++ /* Check alignment */
++ mov r8, src
++ andl r8, 31, COH
++ brne .Lunaligned_src
++ mov r8, r12
++ andl r8, 3, COH
++ brne .Lunaligned_dst
++
++.Laligned_copy:
++ sub len, 32 /* len is kept biased by -32 inside the loop */
++ brlt .Lless_than_32
++
++1: /* Copy 32 bytes at a time */
++ sub src, 32
++ ldm src, r0-r7
++ sub dst, 32
++ sub len, 32
++ stm dst, r0-r7
++ brge 1b /* tests the "sub len, 32" two instructions up */
++
++.Lless_than_32:
++ /* Copy 16 more bytes if possible */
++ sub len, -16
++ brlt .Lless_than_16
++ sub src, 16
++ ldm src, r0-r3
++ sub dst, 16
++ sub len, 16
++ stm dst, r0-r3
++
++.Lless_than_16:
++ /* Do the remaining as byte copies */
++ sub len, -16 /* len was -16..-1; now the 0..15 bytes still to copy */
++ breq 2f
++1: ld.ub r0, --src
++ st.b --dst, r0
++ sub len, 1
++ brne 1b
++
++2: popm r0-r7, pc
++
++.Lunaligned_src:
++ /* Make src cacheline-aligned. r8 = (src & 31) */
++ sub len, r8 /* copying downwards, so exactly r8 bytes reach the boundary */
++1: ld.ub r0, --src
++ st.b --dst, r0
++ sub r8, 1
++ brne 1b
++
++ /* If dst is word-aligned, we're ready to go */
++ pref src[-4]
++ mov r8, 3
++ tst dst, r8
++ breq .Laligned_copy
++
++.Lunaligned_dst:
++ /* src is aligned, but dst is not. Expect bad performance */
++ sub len, 4
++ brlt 2f
++1: ld.w r0, --src
++ st.w --dst, r0
++ sub len, 4
++ brge 1b
++
++2: neg len /* len was -4..-1 (4 + len bytes left); now 1..4 */
++ add pc, pc, len << 2 /* skips len - 1 of the 3 pairs below; assumes each pair assembles to 4 bytes */
++ .rept 3
++ ld.ub r0, --src
++ st.b --dst, r0
++ .endr
++
++ popm r0-r7, pc
+Index: uClibc-0.9.28-avr32/libc/string/avr32/memset.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/memset.S 2006-10-20 10:42:15.000000000 +0200
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (C) 2004 Atmel Norway.
++ */
++
++#define s r12
++#define c r11
++#define n r10
++
++ .text
++ .global memset
++ .type memset, @function
++
++ .global __memset
++ .hidden __memset
++ .type __memset, @function
++
++ .align 1
++memset:
++__memset:
++ cp.w n, 32
++ mov r9, s /* keep the original s for the return value; brge below still tests the cp.w */
++ brge .Llarge_memset
++
++ sub n, 1
++ retlt s /* n was 0: s has not been modified yet */
++1: st.b s++, c
++ sub n, 1
++ brge 1b
++
++ retal r9
++
++.Llarge_memset:
++ mov r8, r11 /* r8 = fill value */
++ mov r11, 3 /* r11 = word-alignment mask */
++ bfins r8, r8, 8, 8 /* copy byte 0 into byte 1 */
++ bfins r8, r8, 16, 16 /* copy the low halfword into the high one: fill byte in all four bytes */
++ tst s, r11
++ breq 2f
++
++1: st.b s++, r8 /* single bytes until s is word-aligned */
++ sub n, 1
++ tst s, r11
++ brne 1b
++
++2: mov r11, r9 /* r11 = original s, returned at the end */
++ mov r9, r8 /* r8 and r9 now both hold the fill pattern for st.d */
++ sub n, 8 /* n is kept biased by -8 inside the loop */
++
++3: st.d s++, r8
++ sub n, 8
++ brge 3b
++
++ /* If we are done, n == -8 and we'll skip all st.b insns below */
++ neg n
++ lsl n, 1 /* assumes each st.b below assembles to 2 bytes */
++ add pc, n
++ .rept 7
++ st.b s++, r8
++ .endr
++ retal r11
+Index: uClibc-0.9.28-avr32/libc/string/avr32/strcat.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/strcat.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,95 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++#define s1 r9
++#define s2 r11
++
++ .text
++ .global strcat
++ .type strcat, @function
++ .align 1
++ /*
++ * char *strcat(char *s1, const char *s2)
++ *
++ * Appends the string at s2 (r11), including its terminating '\0', to
++ * the end of the string passed in r12 and returns the original r12.
++ * Words are only loaded from word-aligned addresses, so a load never
++ * touches a word that lies entirely beyond a string's terminator.
++ * Clobbers r8-r11.
++ */
++strcat:
++ mov s1, r12
++
++ /* Check bytes one at a time until s1 is word-aligned */
++1: mov r10, s1
++ andl r10, 3, COH
++ breq 2f
++ ld.ub r8, s1++
++ cp.w r8, 0
++ brne 1b
++
++ /* Found the '\0' early; s1 was post-incremented past it */
++ sub s1, 1
++ rjmp 3f
++
++ /*
++ * Find the end of the first string: loop until some byte of
++ * the word just loaded is zero.
++ */
++2: ld.w r8, s1++
++ tnbz r8
++ brne 2b
++
++ /* The '\0' is in r8; point s1 at it (the top byte is at s1 - 4) */
++ sub s1, 4
++ bfextu r10, r8, 24, 8
++ cp.w r10, 0
++ breq 3f
++ sub s1, -1
++ bfextu r10, r8, 16, 8
++ cp.w r10, 0
++ breq 3f
++ sub s1, -1
++ bfextu r10, r8, 8, 8
++ cp.w r10, 0
++ breq 3f
++ sub s1, -1
++
++ /* Now, append s2. Start with single bytes until s2 is word-aligned */
++3: mov r10, s2
++ andl r10, 3, COH
++ breq 4f
++ ld.ub r8, s2++
++ st.b s1++, r8
++ cp.w r8, 0
++ brne 3b
++ retal r12
++
++ /*
++ * Copy one word at a time. Only s2 is known to be aligned
++ * here; the stores to s1 may be unaligned.
++ */
++4: ld.w r8, s2++
++ tnbz r8
++ breq 5f
++ st.w s1++, r8
++ rjmp 4b
++
++ /* Copy the remaining bytes */
++5: bfextu r10, r8, 24, 8
++ st.b s1++, r10
++ cp.w r10, 0
++ reteq r12
++ bfextu r10, r8, 16, 8
++ st.b s1++, r10
++ cp.w r10, 0
++ reteq r12
++ bfextu r10, r8, 8, 8
++ st.b s1++, r10
++ cp.w r10, 0
++ reteq r12
++ st.b s1++, r8
++ retal r12
++
++ .size strcat, . - strcat
+Index: uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,80 @@
++/*
++ * Copyright (C) 2004 Atmel Norway.
++ */
++
++#define s1 r12
++#define s2 r11
++#define len r10
++
++ .text
++ .global strcmp
++ .type strcmp, @function
++ .align 1
++strcmp:
++ mov r8, 3 /* word-alignment mask */
++ tst s1, r8
++ brne .Lunaligned_s1
++ tst s2, r8
++ brne .Lunaligned_s2
++
++1: ld.w r8, s1++ /* both pointers are word-aligned from here on */
++ ld.w r9, s2++
++ cp.w r8, r9
++ brne 2f
++ tnbz r8
++ brne 1b /* no zero byte in the (equal) words: keep going */
++ retal 0
++
++2: bfextu r12, r8, 24, 8 /* words differ: compare byte by byte, top byte first */
++ bfextu r11, r9, 24, 8
++ sub r12, r11
++ retne r12
++ cp.w r11, 0
++ reteq 0 /* equal bytes and both are '\0': the strings ended before the difference */
++ bfextu r12, r8, 16, 8
++ bfextu r11, r9, 16, 8
++ sub r12, r11
++ retne r12
++ cp.w r11, 0
++ reteq 0
++ bfextu r12, r8, 8, 8
++ bfextu r11, r9, 8, 8
++ sub r12, r11
++ retne r12
++ cp.w r11, 0
++ reteq 0
++ bfextu r12, r8, 0, 8
++ bfextu r11, r9, 0, 8
++ sub r12, r11
++ retal r12
++
++.Lunaligned_s1:
++3: tst s1, r8 /* r8 is still the mask 3: this path is only entered before the word loop */
++ breq 4f
++ ld.ub r10, s1++
++ ld.ub r9, s2++
++ sub r10, r9
++ retne r10
++ cp.w r9, 0
++ brne 3b
++ retal r10 /* r10 is 0 here: both strings ended */
++
++4: tst s2, r8
++ breq 1b /* s1 and s2 are now both aligned: use the word loop */
++
++.Lunaligned_s2:
++ /*
++ * s1 and s2 can't both be aligned, and unaligned word loads
++ * can trigger spurious exceptions if we cross a page boundary.
++ * Do it the slow way...
++ */
++1: ld.ub r8, s1++
++ ld.ub r9, s2++
++ sub r8, r9
++ retne r8
++ cp.w r9, 0
++ brne 1b
++ retal 0
++
++ .weak strcoll
++ strcoll = strcmp
+Index: uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,63 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ *
++ * To reduce the size, this one might simply call strncpy with len = -1.
++ */
++
++#define dst r9
++#define src r11
++
++ .text
++ .global strcpy
++ .type strcpy, @function
++strcpy:
++ mov dst, r12 /* work on a copy so r12 can be returned unmodified */
++
++ pref src[0]
++
++ /*
++ * Check alignment. If src is aligned but dst isn't, we can't
++ * do much about it...
++ */
++ mov r8, src
++ andl r8, 3, COH
++ brne .Lunaligned_src
++
++.Laligned_copy:
++1: ld.w r8, src++
++ tnbz r8
++ breq 2f
++ st.w dst++, r8
++ rjmp 1b
++
++2: /*
++ * Ok, r8 now contains the terminating '\0'. Copy the
++ * remaining bytes individually.
++ */
++ bfextu r10, r8, 24, 8
++ st.b dst++, r10
++ cp.w r10, 0
++ reteq r12
++ bfextu r10, r8, 16, 8
++ st.b dst++, r10
++ cp.w r10, 0
++ reteq r12
++ bfextu r10, r8, 8, 8
++ st.b dst++, r10
++ cp.w r10, 0
++ reteq r12
++ st.b dst++, r8
++ retal r12
++
++.Lunaligned_src:
++ /*
++ * Copy bytes until src is word-aligned or the '\0' has been copied
++ */
++3: ld.ub r10, src++
++ st.b dst++, r10
++ cp.w r10, 0
++ reteq r12
++ mov r8, src
++ andl r8, 3, COH
++ brne 3b
++ rjmp .Laligned_copy
+Index: uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,144 @@
++
++#include <stdio.h>
++#include <string.h>
++#include <time.h>
++#include <sys/mman.h>
++
++#define BUF_SIZE (8 * 1024)
++
++static char *buf1;
++static char *buf1_ref;
++static char *buf2;
++
++extern void *optimized_memcpy(void *dest, void *src, size_t len);
++extern void *optimized_memmove(void *dest, void *src, size_t len);
++extern char *optimized_strcpy(char *dest, char *src);
++extern char *optimized_strncpy(char *dest, char *src, size_t len);
++
++void dump_mismatch(char *buf, char *ref, size_t len)
++{
++ size_t i, j; /* same type as len, so the loop bound compares like with like */
++
++ for (i = 0; i < len; i += 16) {
++ if (memcmp(buf + i, ref + i, 16) == 0)
++ continue; /* this 16-byte row matches; only differing rows are printed */
++
++ printf("%4x buf:", (unsigned int)i);
++ for (j = i; j < (i + 16); j++)
++ printf(" %02x", (unsigned char)buf[j]); /* avoid sign extension of bytes >= 0x80 */
++ printf("\n ref:");
++ for (j = i; j < (i + 16); j++)
++ printf(" %02x", (unsigned char)ref[j]);
++ printf("\n");
++ }
++}
++
++static void test_memcpy(int src_offset, int dst_offset, int len)
++{
++ clock_t start, old, new;
++ int i;
++
++ memset(buf1, 0x55, BUF_SIZE);
++ memset(buf1_ref, 0x55, BUF_SIZE);
++ memset(buf2, 0xaa, BUF_SIZE);
++
++ printf("Testing memcpy with offsets %d => %d and len %d...",
++ src_offset, dst_offset, len);
++
++ start = clock();
++ for (i = 0; i < 8192; i++) /* repeated so the copy time is measurable */
++ optimized_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
++ new = clock() - start;
++ start = clock();
++ for ( i = 0; i < 8192; i++)
++ memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
++ old = clock() - start;
++
++ if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0) /* whole buffers, so stray writes outside the copy show up too */
++ printf("OK\n");
++ else {
++ printf("FAILED\n");
++ dump_mismatch(buf1, buf1_ref, BUF_SIZE);
++ }
++ printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old); /* clock_t is not necessarily int */
++}
++
++static void test_memmove(int src_offset, int dst_offset, int len)
++{
++ clock_t start, old, new;
++
++ memset(buf1, 0x55, BUF_SIZE);
++ memset(buf1_ref, 0x55, BUF_SIZE);
++ memset(buf2, 0xaa, BUF_SIZE);
++
++ printf("Testing memmove with offsets %d => %d and len %d...",
++ src_offset, dst_offset, len);
++
++ start = clock();
++ optimized_memmove(buf1 + dst_offset, buf2 + src_offset, len); /* source and destination are separate buffers here */
++ new = clock() - start;
++ start = clock();
++ memmove(buf1_ref + dst_offset, buf2 + src_offset, len);
++ old = clock() - start;
++
++ if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0) /* whole buffers, so stray writes outside the copy show up too */
++ printf("OK\n");
++ else {
++ printf("FAILED\n");
++ dump_mismatch(buf1, buf1_ref, BUF_SIZE);
++ }
++ printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old); /* clock_t is not necessarily int */
++}
++
++int main(int argc, char *argv[])
++{
++ buf2 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* fd must be -1 for portable anonymous mappings */
++ if (buf2 == MAP_FAILED) {
++ perror("Failed to allocate memory for buf2");
++ return 1;
++ }
++ buf1 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++ if (buf1 == MAP_FAILED) {
++ perror("Failed to allocate memory for buf1");
++ return 1;
++ }
++ buf1_ref = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++ if (buf1_ref == MAP_FAILED) {
++ perror("Failed to allocate memory for buf1_ref");
++ return 1;
++ }
++ printf("\n === MEMCPY ===\n\n");
++
++ test_memcpy(0, 0, BUF_SIZE - 32); /* arguments are (src_offset, dst_offset, len) */
++ test_memcpy(0, 0, 1);
++ test_memcpy(0, 0, 31);
++ test_memcpy(0, 0, 32);
++ test_memcpy(0, 0, 127);
++ test_memcpy(0, 0, 128);
++ test_memcpy(4, 4, BUF_SIZE - 32 - 4);
++ test_memcpy(1, 1, BUF_SIZE - 32 - 1);
++ test_memcpy(1, 1, 126);
++ test_memcpy(0, 3, 128);
++ test_memcpy(1, 4, 128);
++ test_memcpy(0, 0, 0);
++
++ printf("\n === MEMMOVE ===\n\n");
++
++ test_memmove(0, 0, BUF_SIZE - 32);
++ test_memmove(0, 0, 1);
++ test_memmove(0, 0, 31);
++ test_memmove(0, 0, 32);
++ test_memmove(0, 0, BUF_SIZE - 33);
++ test_memmove(0, 0, 128);
++ test_memmove(4, 4, BUF_SIZE - 32 - 4);
++ test_memmove(1, 1, BUF_SIZE - 32 - 1);
++ test_memmove(1, 1, BUF_SIZE - 130);
++ test_memmove(0, 3, BUF_SIZE - 128);
++ test_memmove(1, 4, BUF_SIZE - 128);
++ test_memmove(0, 0, 0);
++
++ return 0;
++}
+Index: uClibc-0.9.28-avr32/libc/string/avr32/strlen.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/strlen.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,52 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++#define str r12
++
++ .text
++ .global strlen
++ .type strlen, @function
++strlen:
++ mov r11, r12 /* r11 = start of the string; str (r12) is advanced below */
++
++ mov r9, str
++ andl r9, 3, COH
++ brne .Lunaligned_str
++
++1: ld.w r8, str++
++ tnbz r8
++ brne 1b /* no zero byte in this word yet */
++
++ sub r12, r11 /* offset just past the word that contains the '\0' */
++ bfextu r9, r8, 24, 8
++ cp.w r9, 0
++ subeq r12, 4 /* '\0' is the first byte of that word */
++ reteq r12
++ bfextu r9, r8, 16, 8
++ cp.w r9, 0
++ subeq r12, 3
++ reteq r12
++ bfextu r9, r8, 8, 8
++ cp.w r9, 0
++ subeq r12, 2
++ reteq r12
++ sub r12, 1 /* otherwise it is the last byte */
++ retal r12
++
++.Lunaligned_str:
++ add pc, pc, r9 << 3 /* r9 = str & 3; assumes each 3-insn group below is 8 bytes, so 4 - r9 groups run */
++ sub r0, r0, 0 /* 4-byte nop */
++ ld.ub r8, str++
++ sub r8, r8, 0
++ breq 1f
++ ld.ub r8, str++
++ sub r8, r8, 0
++ breq 1f
++ ld.ub r8, str++
++ sub r8, r8, 0
++ brne 1b /* str is word-aligned now: continue with the word loop */
++
++1: sub r12, 1 /* str was post-incremented past the '\0' */
++ sub r12, r11
++ retal r12
+Index: uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,77 @@
++/*
++ * Copyright (C) 2004 Atmel Norway
++ */
++
++#define dst r9
++#define src r11
++#define len r10
++
++ /*
++ * char *strncpy(char *dst, const char *src, size_t len): copy at most len
++ * bytes of the string at src to the buffer passed in r12, zero-filling
++ * the rest of the buffer once the '\0' is copied. Returns the original r12.
++ */
++ .text
++ .global strncpy
++ .type strncpy, @function
++strncpy:
++ mov dst, r12
++ pref src[0]
++ cp.w len, 0
++ reteq r12
++
++ /*
++ * Check alignment. If src is aligned but dst isn't, we can't
++ * do much about it...
++ */
++ mov r8, src
++ andl r8, 3, COH
++ brne .Lunaligned_src
++
++.Laligned_copy:
++ /* Copy whole words while at least four bytes remain */
++ sub len, 4
++ brlt 3f
++1: ld.w r8, src++
++ tnbz r8
++ breq 2f
++ st.w dst++, r8
++ sub len, 4
++ brge 1b
++
++3: sub len, -4
++ reteq r12
++ /* This is safe as long as src is word-aligned and len > 0 */
++ ld.w r8, src++
++ rjmp 4f
++
++2: /* r8 contains the terminating '\0'; undo the last "sub len, 4" */
++ sub len, -4
++4: /* Store r8 byte by byte, top byte first, until '\0' or len runs out */
++ bfextu r11, r8, 24, 8
++ lsl r8, 8
++ st.b dst++, r11
++ sub len, 1
++ reteq r12
++ cp.w r11, 0
++ brne 4b
++.Lzero_fill:
++ /* The '\0' is stored and len > 0: pad the rest of the buffer */
++ mov r8, 0
++5: st.b dst++, r8
++ sub len, 1
++ brne 5b
++ retal r12
++
++.Lunaligned_src:
++ /* Copy bytes until src is aligned, '\0' is copied or len runs out */
++1: ld.ub r8, src++
++ st.b dst++, r8
++ sub len, 1
++ reteq r12
++ cp.w r8, 0
++ breq .Lzero_fill
++ mov r8, src
++ andl r8, 3, COH
++ brne 1b
++ rjmp .Laligned_copy
+Index: uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c
+===================================================================
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c 2006-10-19 15:05:52.000000000 +0200
+@@ -0,0 +1,66 @@
++
++#include <stdio.h>
++#include <string.h>
++
++#define BUF_SIZE 32768
++
++static char buf1[BUF_SIZE] __attribute__((aligned(32)));
++static char buf1_ref[BUF_SIZE] __attribute__((aligned(32)));
++static char buf2[BUF_SIZE] __attribute__((aligned(32)));
++
++extern void *new_memcpy(void *dest, void *src, size_t len);
++
++void dump_mismatch(char *buf, char *ref, size_t len)
++{
++ size_t i, j; /* same type as len, so the loop bound compares like with like */
++
++ for (i = 0; i < len; i += 16) {
++ if (memcmp(buf + i, ref + i, 16) == 0)
++ continue; /* this 16-byte row matches; only differing rows are printed */
++
++ printf("%4x buf:", (unsigned int)i); /* the ' ' flag is not defined for %x */
++ for (j = i; j < (i + 16); j++)
++ printf(" %02x", (unsigned char)buf[j]); /* avoid sign extension of bytes >= 0x80 */
++ printf("\n ref:");
++ for (j = i; j < (i + 16); j++)
++ printf(" %02x", (unsigned char)ref[j]);
++ printf("\n");
++ }
++}
++
++void test(int src_offset, int dst_offset, int len)
++{
++ memset(buf1, 0x55, sizeof(buf1)); /* destination for new_memcpy */
++ memset(buf1_ref, 0x55, sizeof(buf1_ref)); /* destination for the reference memcpy */
++ memset(buf2, 0xaa, sizeof(buf2)); /* source, filled with a different pattern */
++
++ printf("Testing with offsets %d => %d and len %d...",
++ src_offset, dst_offset, len);
++
++ new_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
++ memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
++
++ if (memcmp(buf1, buf1_ref, sizeof(buf1)) == 0) /* whole buffers, so stray writes outside the copy show up too */
++ printf("OK\n");
++ else {
++ printf("FAILED\n");
++ dump_mismatch(buf1, buf1_ref, sizeof(buf1));
++ }
++}
++
++int main(int argc, char *argv[])
++{
++ test(0, 0, BUF_SIZE); /* arguments are (src_offset, dst_offset, len); the buffers are 32-byte aligned */
++ test(0, 0, 1);
++ test(0, 0, 31);
++ test(0, 0, 32);
++ test(0, 0, 127);
++ test(0, 0, 128);
++ test(4, 4, BUF_SIZE - 4); /* both word-aligned but not 32-byte aligned */
++ test(1, 1, BUF_SIZE - 1); /* both misaligned by the same amount */
++ test(1, 1, 126);
++ test(0, 3, 128); /* aligned source, misaligned destination */
++ test(1, 4, 128); /* misaligned source, word-aligned destination */
++
++ return 0;
++}