读书人

请问ARM中的预取命令PLD的使用

发布时间: 2013-06-25 23:45:41 作者: rapoo

请教ARM中的预取命令PLD的使用
我现在在看android2.3.3提供的关于ARM平台的memcmp这个函数的实现代码,它是用汇编编写的,如下:

   .text

.global __memcmp16
.type __memcmp16, %function
.align 4

/*
* __memcmp16: compare two buffers of 16-bit units. Optimized for ARM9.
*
* Register usage, as read from the code below:
*   r0  first buffer on entry; holds the result on return
*   r1  second buffer
*   r2  number of 16-bit units to compare (decremented by 1 per ldrh pair)
*   r3  working copy of the first pointer (r0 is reused for the result)
*   ip, lr  scratch; lr and r4 are pushed before lr is used as scratch
*
* Result: r0 is 0 when the pointers are equal, the count is zero, or every
* unit matches; otherwise r0 is (unit from first buffer) - (unit from second
* buffer) for the first pair of 16-bit units that differ.
*
* NOTE: the comments below use "words" for the 16-bit units counted in r2,
* except where they explicitly say "long word" (32 bits).
*
* This would not be optimal on XScale or ARM11, where more prefetching
* and use of PLD will be needed.
* The 2 major optimizations here are
* (1) The main loop compares 16 half-words (32 bytes) at a time
* (2) The loads are scheduled in a way they won't stall
*/

__memcmp16:
.fnstart
PLD (r0, #0)
PLD (r1, #0)

/* take care of the case where the length is zero or the buffers are the same */
cmp r0, r1
cmpne r2, #0
moveq r0, #0
bxeq lr

/* since r0 holds the result, move the first source
* pointer somewhere else
*/

mov r3, r0

/* make sure we have at least 12 words (16-bit units), this simplifies
* things below and avoids some overhead for small blocks
*/

cmp r2, #12
bpl 0f

/* small blocks (fewer than 12 words): compare one 16-bit unit at a time,
* returning the difference in r0 as soon as a pair differs
*/
PLD (r0, #32)
PLD (r1, #32)

1: ldrh r0, [r3], #2
ldrh ip, [r1], #2
subs r0, r0, ip
bxne lr
subs r2, r2, #1
bne 1b


/* every unit matched: r0 is 0 from the last subs */
bx lr


.save {r4, lr}
/* save registers (lr is used as a scratch register below; r4 is not
* referenced anywhere else in this excerpt -- presumably pushed only to
* keep the stack adjustment at two registers, TODO confirm)
*/
0: stmfd sp!, {r4, lr}

/* align first pointer to word boundary: if bit 1 of r3 is set, compare
* one 16-bit unit so that r3 becomes 4-byte aligned
*/
tst r3, #2
beq 0f

ldrh r0, [r3], #2
ldrh ip, [r1], #2
sub r2, r2, #1
subs r0, r0, ip
/* restore registers and return (only when that unit differed) */
ldmnefd sp!, {r4, lr}
bxne lr
.fnend



0: /* here the first pointer is aligned, and we have at least 3 words
* to process.
*/

/* see if the pointers are congruent, i.e. bit 1 of r3 and r1 agree, so
* both can be read with 32-bit loads
*/
eor r0, r3, r1
ands r0, r0, #2
bne 5f

/* congruent case, 16 half-words per iteration
* We need to make sure there are at least 16+2 words left
* because we effectively read ahead one long word, and we could
* read past the buffer (and segfault) if we're not careful.
*
* ip is preloaded with the first long word of the second buffer; inside
* the loop ip and lr alternate as the read-ahead register, and r1 is
* pre-incremented so that it always points at the long word read last.
*/

ldr ip, [r1]
subs r2, r2, #(16 + 2)
bmi 1f

0: /// PLD is a macro: if the ARM instruction set supports the pld instruction it expands to pld, otherwise it expands to nothing
/// (poster's question) What I do not understand: why is the data at address r3+64 fetched here? The data at r3+64 does not seem to be used here. Could someone please help analyze this? Thanks.
/* NOTE(review): pld is a cache-preload hint and writes no register, so
* nothing consumes its "result" directly. Each pass of this loop reads
* 32 bytes from each pointer (eight 4-byte loads), so an offset of 64
* targets the data two passes ahead -- presumably so that it is already
* cached when the loads get there. Confirm against the PLD macro
* definition (not visible here) and the target's cache line size.
*/
PLD (r3, #64)


PLD (r1, #64) /// (poster's question) why add 64 here?
/* eight long-word compares; once a pair differs Z is cleared, so the
* remaining ldreq/eoreqs are skipped and bne 2f is taken
*/
ldr r0, [r3], #4
ldr lr, [r1, #4]!
eors r0, r0, ip
ldreq r0, [r3], #4
ldreq ip, [r1, #4]!
eoreqs r0, r0, lr
ldreq r0, [r3], #4
ldreq lr, [r1, #4]!
eoreqs r0, r0, ip
ldreq r0, [r3], #4
ldreq ip, [r1, #4]!
eoreqs r0, r0, lr
ldreq r0, [r3], #4
ldreq lr, [r1, #4]!
eoreqs r0, r0, ip
ldreq r0, [r3], #4
ldreq ip, [r1, #4]!
eoreqs r0, r0, lr
ldreq r0, [r3], #4
ldreq lr, [r1, #4]!
eoreqs r0, r0, ip
ldreq r0, [r3], #4
ldreq ip, [r1, #4]!
eoreqs r0, r0, lr
bne 2f
subs r2, r2, #16
bhs 0b

/* do we have at least 2 words left?
* r2 is (remaining - 18) here; adding 16 leaves (remaining - 2)
*/
1: adds r2, r2, #(16 - 2 + 2)
bmi 4f

/* finish off 2 words at a time (one long word per pass, no read-ahead) */


3: ldr r0, [r3], #4
ldr ip, [r1], #4
eors r0, r0, ip
bne 2f
subs r2, r2, #2
bhs 3b

/* are we done? (adding 2 turns r2 back into the remaining count) */
4: adds r2, r2, #2
bne 8f
/* restore registers and return */
mov r0, #0
ldmfd sp!, {r4, lr}
bx lr

2: /* the last 2 words are different, restart them: re-read the long word
* just compared as two 16-bit units (at offsets -4 and -2 from both
* pointers) so that r0 gets the difference of the first unit that differs
*/
ldrh r0, [r3, #-4]
ldrh ip, [r1, #-4]
subs r0, r0, ip
ldreqh r0, [r3, #-2]
ldreqh ip, [r1, #-2]
subeqs r0, r0, ip
/* restore registers and return */
ldmfd sp!, {r4, lr}
bx lr

/* process the last few words, one 16-bit unit at a time */
8: ldrh r0, [r3], #2
ldrh ip, [r1], #2
subs r0, r0, ip
bne 9f
subs r2, r2, #1
bne 8b

9: /* restore registers and return (r0 already holds the result) */
ldmfd sp!, {r4, lr}
bx lr


5: /*************** non-congruent case ***************/



/* align the unaligned pointer: round r1 down to a 4-byte boundary and
* preload one long word into lr. Each compare below rebuilds the second
* buffer's long word in ip from the upper 16 bits of the previous load
* and the lower 16 bits of the next one.
* NOTE(review): this shift-based merge looks like it assumes
* little-endian memory -- confirm.
*/
bic r1, r1, #3
ldr lr, [r1], #4
sub r2, r2, #8

/* 4 long words (8 units) per pass; same conditional-chain scheme as the
* congruent loop
*/
6:
PLD (r3, #64)
PLD (r1, #64)
mov ip, lr, lsr #16
ldr lr, [r1], #4
ldr r0, [r3], #4
orr ip, ip, lr, lsl #16
eors r0, r0, ip
moveq ip, lr, lsr #16
ldreq lr, [r1], #4
ldreq r0, [r3], #4
orreq ip, ip, lr, lsl #16
eoreqs r0, r0, ip
moveq ip, lr, lsr #16
ldreq lr, [r1], #4
ldreq r0, [r3], #4
orreq ip, ip, lr, lsl #16
eoreqs r0, r0, ip
moveq ip, lr, lsr #16
ldreq lr, [r1], #4
ldreq r0, [r3], #4
orreq ip, ip, lr, lsl #16
eoreqs r0, r0, ip
bne 7f
subs r2, r2, #8
bhs 6b
/* undo the rounding/read-ahead so r1 points at the next unread unit */
sub r1, r1, #2
/* are we done? */
adds r2, r2, #8
moveq r0, #0


beq 9b
/* finish off the remaining 16-bit units */
b 8b

7: /* fix up the 2 pointers and fall through to the mismatch handler at 2:,
* which expects the differing long word at offset -4 from both pointers
*/
sub r1, r1, #2
b 2b


[解决办法]
Cache未命中时的分配策略:读操作分配策略、读/写分配策略。
a). 读操作分配策略,当Cache未命中时,只有进行存储器读操作时,才分配Cache行。如果被替换的Cache行包含有效数据,那么在该行被新的数据填充之前,要先把原来的内容写入到主存中去。采用读操作分配策略时,存储器写操作不会更新Cache行,除非相关的Cache行恰好是前一个主存读操作刚分配的。
b). 采用读/写分配策略,无论存储器读还是写操作,在Cache未命中时,都将分配Cache行。对于存储器写操作,如果Cache未命中,将分配一个 Cache行。如果被替换的Cache行中包含有效数据,控制器会先将该行数据写入主存,再用从主存读取的数据将该行Cache覆盖,最后把内核数据写入该Cache行中。如果采用Cache直写策略,内核数据将会同时被写入到主存中。
因此:一次性取多少数据取决于Cache行的大小,也就是Cache块的大小,但我有一个疑问,数据已经在寄存器中了,为什么还要预读取?

读书人网 >汇编语言

热点推荐