target/linux/omap24xx/patches-2.6.36/100-optimized-arm-div.patch

   1 ---
   2  arch/arm/boot/compressed/lib1funcs.S |  348 +++++++++++++++++++++++++++++++++++
   3  1 file changed, 348 insertions(+)
   4
   5 --- /dev/null
   6 +++ linux-2.6.35/arch/arm/boot/compressed/lib1funcs.S
   7 @@ -0,0 +1,348 @@
   8 +/*
   9 + * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
  10 + *
  11 + * Author: Nicolas Pitre <nico@fluxnic.net>
  12 + *   - contributed to gcc-3.4 on Sep 30, 2003
  13 + *   - adapted for the Linux kernel on Oct 2, 2003
  14 + */
  15 +
  16 +/* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc.
  17 +
  18 +This file is free software; you can redistribute it and/or modify it
  19 +under the terms of the GNU General Public License as published by the
  20 +Free Software Foundation; either version 2, or (at your option) any
  21 +later version.
  22 +
  23 +In addition to the permissions in the GNU General Public License, the
  24 +Free Software Foundation gives you unlimited permission to link the
  25 +compiled version of this file into combinations with other programs,
  26 +and to distribute those combinations without any restriction coming
  27 +from the use of this file.  (The General Public License restrictions
  28 +do apply in other respects; for example, they cover modification of
  29 +the file, and distribution when not linked into a combine
  30 +executable.)
  31 +
  32 +This file is distributed in the hope that it will be useful, but
  33 +WITHOUT ANY WARRANTY; without even the implied warranty of
  34 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  35 +General Public License for more details.
  36 +
  37 +You should have received a copy of the GNU General Public License
  38 +along with this program; see the file COPYING.  If not, write to
  39 +the Free Software Foundation, 59 Temple Place - Suite 330,
  40 +Boston, MA 02111-1307, USA.  */
  41 +
  42 +
  43 +#include <linux/linkage.h>
  44 +#include <asm/assembler.h>
  45 +
  46 +
  47 +.macro ARM_DIV_BODY dividend, divisor, result, curbit
  48 +       @ Unsigned divide core: builds the quotient in result; clobbers dividend, divisor, curbit.
  49 +#if __LINUX_ARM_ARCH__ >= 5
  50 +       @ ARMv5 and later: a clz pair gives the shift that lines the divisor up with the dividend.
  51 +       clz     \curbit, \divisor
  52 +       clz     \result, \dividend
  53 +       sub     \result, \curbit, \result       @ shift = clz(divisor) - clz(dividend)
  54 +       mov     \curbit, #1
  55 +       mov     \divisor, \divisor, lsl \result
  56 +       mov     \curbit, \curbit, lsl \result
  57 +       mov     \result, #0                     @ quotient accumulates from zero
  58 +
  59 +#else
  60 +
  61 +       @ Initially shift the divisor left 3 bits if possible,
  62 +       @ set curbit accordingly.  This allows for curbit to be located
  63 +       @ at the left end of each 4-bit nibble in the division loop
  64 +       @ to save one loop in most cases.
  65 +       tst     \divisor, #0xe0000000
  66 +       moveq   \divisor, \divisor, lsl #3
  67 +       moveq   \curbit, #8
  68 +       movne   \curbit, #1
  69 +
  70 +       @ Unless the divisor is very big, shift it up in multiples of
  71 +       @ four bits, since this is the amount of unwinding in the main
  72 +       @ division loop.  Continue shifting until the divisor is
  73 +       @ larger than the dividend.
  74 +1:     cmp     \divisor, #0x10000000
  75 +       cmplo   \divisor, \dividend
  76 +       movlo   \divisor, \divisor, lsl #4
  77 +       movlo   \curbit, \curbit, lsl #4
  78 +       blo     1b
  79 +
  80 +       @ For very big divisors, we must shift it a bit at a time, or
  81 +       @ we will be in danger of overflowing.
  82 +1:     cmp     \divisor, #0x80000000
  83 +       cmplo   \divisor, \dividend
  84 +       movlo   \divisor, \divisor, lsl #1
  85 +       movlo   \curbit, \curbit, lsl #1
  86 +       blo     1b
  87 +
  88 +       mov     \result, #0
  89 +
  90 +#endif
  91 +
  92 +       @ Division loop: four quotient bits per pass (curbit down to curbit >> 3).
  93 +1:     cmp     \dividend, \divisor
  94 +       subhs   \dividend, \dividend, \divisor
  95 +       orrhs   \result,   \result,   \curbit
  96 +       cmp     \dividend, \divisor,  lsr #1
  97 +       subhs   \dividend, \dividend, \divisor, lsr #1
  98 +       orrhs   \result,   \result,   \curbit,  lsr #1
  99 +       cmp     \dividend, \divisor,  lsr #2
 100 +       subhs   \dividend, \dividend, \divisor, lsr #2
 101 +       orrhs   \result,   \result,   \curbit,  lsr #2
 102 +       cmp     \dividend, \divisor,  lsr #3
 103 +       subhs   \dividend, \dividend, \divisor, lsr #3
 104 +       orrhs   \result,   \result,   \curbit,  lsr #3
 105 +       cmp     \dividend, #0                   @ Early termination?
 106 +       movnes  \curbit,   \curbit,  lsr #4     @ No, any more bits to do?
 107 +       movne   \divisor,  \divisor, lsr #4
 108 +       bne     1b
 109 +
 110 +.endm
 111 +
 112 +
 113 +.macro ARM_DIV2_ORDER divisor, order
 114 +       @ order = log2(divisor); both expansions in this file are reached only for a power of two.
 115 +#if __LINUX_ARM_ARCH__ >= 5
 116 +
 117 +       clz     \order, \divisor
 118 +       rsb     \order, \order, #31             @ 31 - clz = index of the top set bit
 119 +
 120 +#else
 121 +       @ No clz: binary search in steps of 16, 8 and 4 bits, shifting the divisor down as it goes.
 122 +       cmp     \divisor, #(1 << 16)
 123 +       movhs   \divisor, \divisor, lsr #16
 124 +       movhs   \order, #16
 125 +       movlo   \order, #0
 126 +
 127 +       cmp     \divisor, #(1 << 8)
 128 +       movhs   \divisor, \divisor, lsr #8
 129 +       addhs   \order, \order, #8
 130 +
 131 +       cmp     \divisor, #(1 << 4)
 132 +       movhs   \divisor, \divisor, lsr #4
 133 +       addhs   \order, \order, #4
 134 +
 135 +       cmp     \divisor, #(1 << 2)
 136 +       addhi   \order, \order, #3              @ divisor > 4 (8 for a power of two)
 137 +       addls   \order, \order, \divisor, lsr #1        @ divisor 1, 2 or 4 adds 0, 1 or 2
 138 +
 139 +#endif
 140 +
 141 +.endm
 142 +
 143 +
 144 +.macro ARM_MOD_BODY dividend, divisor, order, spare
 145 +       @ Unsigned modulo core: reduces dividend to the remainder; clobbers divisor and order.
 146 +#if __LINUX_ARM_ARCH__ >= 5
 147 +       @ ARMv5 and later: order = clz(divisor) - clz(dividend), with spare as scratch.
 148 +       clz     \order, \divisor
 149 +       clz     \spare, \dividend
 150 +       sub     \order, \order, \spare
 151 +       mov     \divisor, \divisor, lsl \order
 152 +
 153 +#else
 154 +       @ No clz: order counts how many bit positions the divisor gets shifted up.
 155 +       mov     \order, #0
 156 +
 157 +       @ Unless the divisor is very big, shift it up in multiples of
 158 +       @ four bits, since this is the amount of unwinding in the main
 159 +       @ division loop.  Continue shifting until the divisor is
 160 +       @ larger than the dividend.
 161 +1:     cmp     \divisor, #0x10000000
 162 +       cmplo   \divisor, \dividend
 163 +       movlo   \divisor, \divisor, lsl #4
 164 +       addlo   \order, \order, #4
 165 +       blo     1b
 166 +
 167 +       @ For very big divisors, we must shift it a bit at a time, or
 168 +       @ we will be in danger of overflowing.
 169 +1:     cmp     \divisor, #0x80000000
 170 +       cmplo   \divisor, \dividend
 171 +       movlo   \divisor, \divisor, lsl #1
 172 +       addlo   \order, \order, #1
 173 +       blo     1b
 174 +
 175 +#endif
 176 +
 177 +       @ Perform all needed subtractions to keep only the remainder.
 178 +       @ Do comparisons in batch of 4 first.
 179 +       subs    \order, \order, #3              @ yes, 3 is intended here
 180 +       blt     2f
 181 +
 182 +1:     cmp     \dividend, \divisor
 183 +       subhs   \dividend, \dividend, \divisor
 184 +       cmp     \dividend, \divisor,  lsr #1
 185 +       subhs   \dividend, \dividend, \divisor, lsr #1
 186 +       cmp     \dividend, \divisor,  lsr #2
 187 +       subhs   \dividend, \dividend, \divisor, lsr #2
 188 +       cmp     \dividend, \divisor,  lsr #3
 189 +       subhs   \dividend, \dividend, \divisor, lsr #3
 190 +       cmp     \dividend, #1
 191 +       mov     \divisor, \divisor, lsr #4      @ plain mov: flags from the cmp above survive
 192 +       subges  \order, \order, #4              @ skipped when the cmp saw dividend < 1 (signed)
 193 +       bge     1b
 194 +
 195 +       tst     \order, #3
 196 +       teqne   \dividend, #0
 197 +       beq     5f                              @ done if (order & 3) == 0 or dividend == 0
 198 +
 199 +       @ Either 1, 2 or 3 comparison/subtractions are left.
 200 +2:     cmn     \order, #2
 201 +       blt     4f                              @ order < -2: one step left
 202 +       beq     3f                              @ order == -2: two steps left
 203 +       cmp     \dividend, \divisor
 204 +       subhs   \dividend, \dividend, \divisor
 205 +       mov     \divisor,  \divisor,  lsr #1
 206 +3:     cmp     \dividend, \divisor
 207 +       subhs   \dividend, \dividend, \divisor
 208 +       mov     \divisor,  \divisor,  lsr #1
 209 +4:     cmp     \dividend, \divisor
 210 +       subhs   \dividend, \dividend, \divisor
 211 +5:
 212 +.endm
 213 +
 214 +
 215 +ENTRY(__udivsi3)
 216 +ENTRY(__aeabi_uidiv)
 217 +       @ Unsigned 32-bit divide. In: r0 = dividend, r1 = divisor. Out: r0 = quotient.
 218 +       subs    r2, r1, #1                      @ r2 = divisor - 1
 219 +       moveq   pc, lr                          @ divisor == 1: r0 is already the quotient
 220 +       bcc     Ldiv0                           @ borrow means divisor == 0
 221 +       cmp     r0, r1
 222 +       bls     11f                             @ dividend <= divisor: quotient is 1 or 0
 223 +       tst     r1, r2
 224 +       beq     12f                             @ divisor & (divisor - 1) == 0: power of two
 225 +
 226 +       ARM_DIV_BODY r0, r1, r2, r3
 227 +
 228 +       mov     r0, r2                          @ quotient was built in r2
 229 +       mov     pc, lr
 230 +
 231 +11:    moveq   r0, #1                          @ dividend == divisor
 232 +       movne   r0, #0                          @ dividend < divisor
 233 +       mov     pc, lr
 234 +
 235 +12:    ARM_DIV2_ORDER r1, r2
 236 +
 237 +       mov     r0, r0, lsr r2                  @ quotient = dividend >> log2(divisor)
 238 +       mov     pc, lr
 239 +
 240 +ENDPROC(__udivsi3)
 241 +ENDPROC(__aeabi_uidiv)
 242 +
 243 +ENTRY(__umodsi3)
 244 +       @ Unsigned 32-bit modulo. In: r0 = dividend, r1 = divisor. Out: r0 = remainder.
 245 +       subs    r2, r1, #1                      @ compare divisor with 1
 246 +       bcc     Ldiv0                           @ borrow means divisor == 0
 247 +       cmpne   r0, r1                          @ compare dividend with divisor
 248 +       moveq   r0, #0                          @ divisor == 1 or dividend == divisor
 249 +       tsthi   r1, r2                          @ see if divisor is power of 2
 250 +       andeq   r0, r0, r2                      @ power of 2: mask with divisor - 1 (no-op if r0 is 0)
 251 +       movls   pc, lr                          @ return if any shortcut above applied
 252 +
 253 +       ARM_MOD_BODY r0, r1, r2, r3
 254 +
 255 +       mov     pc, lr
 256 +
 257 +ENDPROC(__umodsi3)
 258 +
 259 +ENTRY(__divsi3)
 260 +ENTRY(__aeabi_idiv)
 261 +       @ Signed 32-bit divide. In: r0 = dividend, r1 = divisor. Out: r0 = quotient.
 262 +       cmp     r1, #0
 263 +       eor     ip, r0, r1                      @ save the sign of the result.
 264 +       beq     Ldiv0                           @ flags still from the cmp: divisor == 0
 265 +       rsbmi   r1, r1, #0                      @ loops below use unsigned.
 266 +       subs    r2, r1, #1                      @ division by 1 or -1 ?
 267 +       beq     10f
 268 +       movs    r3, r0
 269 +       rsbmi   r3, r0, #0                      @ positive dividend value
 270 +       cmp     r3, r1
 271 +       bls     11f                             @ magnitude of dividend <= magnitude of divisor
 272 +       tst     r1, r2                          @ divisor is power of 2 ?
 273 +       beq     12f
 274 +
 275 +       ARM_DIV_BODY r3, r1, r0, r2
 276 +
 277 +       cmp     ip, #0
 278 +       rsbmi   r0, r0, #0                      @ negate when the operand signs differed
 279 +       mov     pc, lr
 280 +
 281 +10:    teq     ip, r0                          @ same sign ?
 282 +       rsbmi   r0, r0, #0
 283 +       mov     pc, lr
 284 +
 285 +11:    movlo   r0, #0                          @ smaller magnitude: quotient is 0
 286 +       moveq   r0, ip, asr #31                 @ equal magnitudes: 0 or -1 from the saved sign
 287 +       orreq   r0, r0, #1                      @ ... turned into +1 or -1
 288 +       mov     pc, lr
 289 +
 290 +12:    ARM_DIV2_ORDER r1, r2
 291 +
 292 +       cmp     ip, #0
 293 +       mov     r0, r3, lsr r2                  @ plain mov keeps the flags from the cmp above
 294 +       rsbmi   r0, r0, #0
 295 +       mov     pc, lr
 296 +
 297 +ENDPROC(__divsi3)
 298 +ENDPROC(__aeabi_idiv)
 299 +
 300 +ENTRY(__modsi3)
 301 +       @ Signed 32-bit modulo. In: r0 = dividend, r1 = divisor. Out: r0 = remainder, signed like the dividend.
 302 +       cmp     r1, #0
 303 +       beq     Ldiv0                           @ divisor == 0
 304 +       rsbmi   r1, r1, #0                      @ loops below use unsigned.
 305 +       movs    ip, r0                          @ preserve sign of dividend
 306 +       rsbmi   r0, r0, #0                      @ if negative make positive
 307 +       subs    r2, r1, #1                      @ compare divisor with 1
 308 +       cmpne   r0, r1                          @ compare dividend with divisor
 309 +       moveq   r0, #0
 310 +       tsthi   r1, r2                          @ see if divisor is power of 2
 311 +       andeq   r0, r0, r2
 312 +       bls     10f                             @ a shortcut applied: skip the subtract loop
 313 +
 314 +       ARM_MOD_BODY r0, r1, r2, r3
 315 +
 316 +10:    cmp     ip, #0
 317 +       rsbmi   r0, r0, #0                      @ negate when the dividend was negative
 318 +       mov     pc, lr
 319 +
 320 +ENDPROC(__modsi3)
 321 +
 322 +#ifdef CONFIG_AEABI
 323 +
 324 +ENTRY(__aeabi_uidivmod)
 325 +       @ Unsigned divide with remainder. In: r0 = numerator, r1 = denominator. Out: r0 = quotient, r1 = remainder.
 326 +       stmfd   sp!, {r0, r1, ip, lr}
 327 +       bl      __aeabi_uidiv
 328 +       ldmfd   sp!, {r1, r2, ip, lr}           @ r1 = saved numerator, r2 = saved denominator
 329 +       mul     r3, r0, r2                      @ r3 = quotient * denominator
 330 +       sub     r1, r1, r3                      @ remainder = numerator - r3
 331 +       mov     pc, lr
 332 +
 333 +ENDPROC(__aeabi_uidivmod)
 334 +
 335 +ENTRY(__aeabi_idivmod)
 336 +       @ Signed divide with remainder. In: r0 = numerator, r1 = denominator. Out: r0 = quotient, r1 = remainder.
 337 +       stmfd   sp!, {r0, r1, ip, lr}
 338 +       bl      __aeabi_idiv
 339 +       ldmfd   sp!, {r1, r2, ip, lr}           @ r1 = saved numerator, r2 = saved denominator
 340 +       mul     r3, r0, r2                      @ r3 = quotient * denominator
 341 +       sub     r1, r1, r3                      @ remainder = numerator - r3
 342 +       mov     pc, lr
 343 +
 344 +ENDPROC(__aeabi_idivmod)
 345 +
 346 +#endif
 347 +
 348 +Ldiv0:
 349 +       @ Divide-by-zero exit. Reached by branch, so lr still holds the return address of the original caller.
 350 +       str     lr, [sp, #-8]!                  @ push lr, moving sp down by 8
 351 +       bl      __div0                          @ handler is not defined in this file
 352 +       mov     r0, #0                  @ About as wrong as it could be.
 353 +       ldr     pc, [sp], #8                    @ pop the saved lr straight into pc
 354 +
 355 +