mirror of
				https://github.com/RaySollium99/picodrive.git
				synced 2025-10-27 21:48:50 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			432 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			432 lines
		
	
	
	
		
			11 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| @ assembly "optimized" blitter and copy functions
 | |
| @ all pointers must be word-aligned
 | |
| 
 | |
| @ (c) Copyright 2006, notaz
 | |
| @ All Rights Reserved
 | |
| 
 | |
| 
 | |
@ convRGB444 reg
@
@ Swaps the red and blue nibbles of the two 16-bit pixels packed in \reg:
@   in : 0000bbb0 ggg0rrr0   (per halfword)
@   out: 0000rrr0 ggg0bbb0
@
@ Requires: lr = 0x000F000F (nibble mask, shifted as needed below)
@ Clobbers: r2, r3 (condition flags are left untouched)
.macro convRGB444 reg
    and     r3,   \reg, lr, lsl #8 @ r3 = blue nibbles  (bits 8-11 / 24-27)
    and     r2,   \reg, lr         @ r2 = red nibbles   (bits 0-3  / 16-19)
    and     \reg, \reg, lr, lsl #4 @ keep only green, already in its final place
    orr     \reg, \reg, r3, lsr #8 @ blue drops down into the low nibble
    orr     \reg, \reg, r2, lsl #8 @ red moves up into the former blue slot
.endm
 | |
| 
 | |
.global vidConvCpyRGB444 @ void *to, void *from, int pixels

@ void vidConvCpyRGB444(void *to, void *from, int pixels)
@
@ Copies 16-bit pixels from 'from' to 'to', swapping the red and blue
@ nibbles of every pixel (see convRGB444).
@
@ In:  r0 = to, r1 = from (both word-aligned), r2 = pixels
@ Work is done in groups of 16 pixels (8 words); pixels/16 groups are
@ converted and any remainder below 16 is ignored. A count below 16
@ (including zero or a negative value) returns without touching memory.
@ Clobbers: r0-r3, r12, flags
vidConvCpyRGB444:
    cmp     r2, #16
    bxlt    lr                  @ no complete 16-pixel group: the loop below
                                @ always runs at least once, so bail out here
    stmfd   sp!, {r4-r11,lr}

    mov     r12, r2, lsr #4     @ r12 = number of 16-pixel groups
    mov     lr, #0xF0000
    orr     lr, lr, #0xF        @ lr == pattern 0x000F000F


.loopRGB444:
    subs    r12, r12, #1        @ flags survive until the bgt (macros set none)

    @ Single-word stores are deliberate: a store-multiple was tried first
    @ and did not seem any faster, probably because the destination is
    @ slow video memory.
    ldmia   r1!, {r4-r11}
    convRGB444 r4
    str     r4, [r0], #4
    convRGB444 r5
    str     r5, [r0], #4
    convRGB444 r6
    str     r6, [r0], #4
    convRGB444 r7
    str     r7, [r0], #4
    convRGB444 r8
    str     r8, [r0], #4
    convRGB444 r9
    str     r9, [r0], #4
    convRGB444 r10
    str     r10, [r0], #4
    convRGB444 r11
    str     r11, [r0], #4

    bgt     .loopRGB444


    ldmfd   sp!, {r4-r11,lr}
    bx      lr
 | |
| 
 | |
| 
 | |
@ convRGB565 reg
@
@ Repacks the two 16-bit pixels held in \reg:
@   in : 0000bbb0 ggg0rrr0   (per halfword)
@   out: rrr00ggg 000bbb00
@
@ Requires: lr = 0x07800780 (the 4-bit green field at its output position;
@           the red and blue masks are derived from it by shifting)
@ Clobbers: r2, r3 (condition flags are left untouched)
.macro convRGB565 reg
    and     r3,   \reg, lr,  lsl #1  @ r3 = blue nibbles (mask 0x0F000F00)
    and     r2,   \reg, lr,  lsr #7  @ r2 = red nibbles  (mask 0x000F000F)
    and     \reg, lr,   \reg,lsl #3  @ green shifted up 3 bits, everything else dropped
    orr     \reg, \reg, r3,  lsr #7  @ blue lands in bits 1-4 of each halfword
    orr     \reg, \reg, r2,  lsl #12 @ red lands in bits 12-15 of each halfword
.endm
 | |
| 
 | |
.global vidConvCpyRGB565 @ void *to, void *from, int pixels

@ void vidConvCpyRGB565(void *to, void *from, int pixels)
@
@ Copies 16-bit pixels from 'from' to 'to', converting each one with
@ convRGB565 (0000bbb0 ggg0rrr0 -> rrr00ggg 000bbb00).
@
@ In:  r0 = to, r1 = from (both word-aligned), r2 = pixels
@ Work is done in groups of 16 pixels (8 words); pixels/16 groups are
@ converted and any remainder below 16 is ignored. A count below 16
@ (including zero or a negative value) returns without touching memory.
@ Clobbers: r0-r3, r12, flags
vidConvCpyRGB565:
    cmp     r2, #16
    bxlt    lr                  @ no complete 16-pixel group: the loop below
                                @ always runs at least once, so bail out here
    stmfd   sp!, {r4-r11,lr}

    mov     r12, r2, lsr #4     @ r12 = number of 16-pixel groups
    mov     lr, #0x07800000
    orr     lr, lr, #0x780      @ lr == pattern 0x07800780

.loopRGB565:
    subs    r12, r12, #1        @ flags survive until the bgt (macros set none)

    ldmia   r1!, {r4-r11}
    convRGB565 r4
    str     r4, [r0], #4
    convRGB565 r5
    str     r5, [r0], #4
    convRGB565 r6
    str     r6, [r0], #4
    convRGB565 r7
    str     r7, [r0], #4
    convRGB565 r8
    str     r8, [r0], #4
    convRGB565 r9
    str     r9, [r0], #4
    convRGB565 r10
    str     r10, [r0], #4
    convRGB565 r11
    str     r11, [r0], #4

    bgt     .loopRGB565

    ldmfd   sp!, {r4-r11,lr}
    bx      lr
 | |
| 
 | |
| 
 | |
@ Source words hold two pixels: 0000bbb0 ggg0rrr0 0000bbb0 ggg0rrr0
@ Each pixel expands to:        00000000 rrr00000 ggg00000 bbb00000
@
@ convRGB32_l rout rin
@
@ Expands the LOW halfword pixel of \rin into a 32-bit pixel in \rout.
@ \rout may be the same register as \rin, and \rout may be r3.
@
@ Requires: lr = 0x0000F000
@ Clobbers: r2, r3 (condition flags are left untouched)
.macro convRGB32_l rout rin
    and     r3,    \rin,  lr,   lsr #4  @ r3 = blue nibble (bits 8-11)
    and     r2,    \rin,  lr,   lsr #12 @ r2 = red nibble  (bits 0-3)
    orr     r2,    r3,    r2,   lsl #24 @ r2 = red at bits 24-27 | blue at bits 8-11
    and     \rout, lr,    \rin, lsl #8  @ green moved up to bits 12-15
    orr     \rout, \rout, r2,   lsr #4  @ red -> bits 20-23, blue -> bits 4-7
.endm
 | |
| 
 | |
@ convRGB32_h rout rin
@
@ Expands the HIGH halfword pixel of \rin (0000bbb0 ggg0rrr0 in bits 16-31)
@ into a 32-bit pixel 00000000 rrr00000 ggg00000 bbb00000 in \rout.
@ \rout may be the same register as \rin, and \rout may be r3.
@ Blue is extracted with a plain shift, so bits 28-31 of \rin must be zero,
@ as the input format specifies.
@
@ Requires: lr = 0x0000F000
@ Clobbers: r2, r3 (condition flags are left untouched)
.macro convRGB32_h rout rin
    mov     r3,    \rin,        lsr #24 @ r3 = blue nibble, now at bits 0-3
    and     r2,    \rin,  lr,   lsl #4  @ r2 = red nibble, still at bits 16-19
    orr     r2,    r2,    r3            @ r2 = red | blue
    and     \rout, lr,    \rin, lsr #8  @ green moved down to bits 12-15
    orr     \rout, \rout, r2,   lsl #4  @ red -> bits 20-23, blue -> bits 4-7
.endm
 | |
| 
 | |
@ convRGB32_2 rin [, rethigh]
@
@ Converts both 16-bit pixels of \rin (0000bbb0 ggg0rrr0 each) to 32-bit
@ 00r0g0b0 pixels; one opcode shorter than convRGB32_l + convRGB32_h.
@
@ The low pixel is always stored to [r0], post-incrementing r0 by 4
@ (its value is also left in r3).
@ The high pixel depends on \rethigh:
@   rethigh = 0 (default): stored to [r0] as well (r0 advances by 8 in total)
@   rethigh != 0         : returned in \rin, nothing more is stored
@
@ Requires: lr = 0x00F000F0
@ Clobbers: r2, r3, \rin (condition flags are left untouched)
.macro convRGB32_2 rin rethigh=0
    and     r3,  \rin, lr                 @ green nibbles of both pixels
    and     r2,  lr, \rin, lsr #4         @ blue nibbles of both pixels
    orr     r2,  r2,   r3, lsl #8         @ g0b0g0b0

    and     \rin,lr,  \rin, ror #12       @ 00r000r0 (reversed: low pixel's red on top)
    mov     r3,  r2,  lsl #16             @ g0b00000
    orr     r3,  r3,  \rin, lsr #16       @ g0b000r0
    mov     r3,  r3,  ror #16             @ r3 = low pixel, 00r0g0b0
    mov     r2,  r2,  lsr #16             @ 0000g0b0 of the high pixel

    str     r3, [r0], #4

.if \rethigh
    orr     \rin,r2,  \rin, lsl #16       @ hand the high pixel back in \rin
.else
    orr     r2,  r2,  \rin, lsl #16       @ r2 = high pixel, 00r0g0b0
    str     r2, [r0], #4
.endif
.endm
 | |
| 
 | |
| 
 | |
.global vidConvCpyRGB32 @ void *to, void *from, int pixels

@ void vidConvCpyRGB32(void *to, void *from, int pixels)
@
@ Expands 16-bit source pixels (0000bbb0 ggg0rrr0) into 32-bit destination
@ pixels (00r0g0b0 layout, see convRGB32_2).
@
@ In:  r0 = to, r1 = from (both word-aligned), r2 = pixels
@ Work is done in groups of 8 pixels (4 source words -> 8 destination
@ words); pixels/8 groups are converted and any remainder below 8 is
@ ignored. A count below 8 (including zero or a negative value) returns
@ without touching memory.
@ Clobbers: r0-r3, r12, flags
vidConvCpyRGB32:
    cmp     r2, #8
    bxlt    lr                  @ no complete 8-pixel group: the loop below
                                @ always runs at least once, so bail out here
    stmfd   sp!, {r4-r7,lr}

    mov     r12, r2, lsr #3     @ r12 = number of 8-pixel groups
    mov     lr, #0x00F00000
    orr     lr, lr, #0x00F0     @ lr == pattern 0x00F000F0

.loopRGB32:
    subs    r12, r12, #1        @ flags survive until the bgt (macros set none)

    ldmia   r1!, {r4-r7}
    convRGB32_2 r4              @ each expansion stores two words via r0
    convRGB32_2 r5
    convRGB32_2 r6
    convRGB32_2 r7

    bgt     .loopRGB32

    ldmfd   sp!, {r4-r7,lr}
    bx      lr
 | |
| 
 | |
| 
 | |
| @ -------- M2 stuff ---------
 | |
| 
 | |
@ 16-bit rotating copies.
@
@ Both routines read two adjacent source lines at a time and write them as
@ one two-pixel-wide column of the destination, whose lines are 208 pixels
@ (208*2 bytes) long. The source line pitch is 328 pixels.
@
@ The destination base pointer is kept in r2, which is free as soon as the
@ width argument has been folded into the r12 counter word. sp therefore
@ points at the stack for the whole routine and no static save slot is
@ needed.
@
@ r12 layout inside the loops:
@   bits 24-31  8-pixel groups left on the current line pair, minus 1
@   bits 16-23  line pairs left, minus 1 (starts at (208-2)/2 = 0x67)
@   bits  0-7   width-1, kept to reload bits 24-31 for the next line pair
@ The group counter is tested by sign, so width must stay within 1..128.


@ rot_str16_90 ru rl
@
@ ru holds 2 pixels from the upper source line, rl the 2 pixels below them.
@ Writes two destination words, one destination line apart:
@   first : (ru.low  << 16) | rl.low
@   second: (ru.high << 16) | rl.high
@ r0 is post-incremented by 208*2 after each store. Clobbers r3 only.
.macro rot_str16_90 ru rl
    mov     r3, \rl,lsl #16
    mov     r3, r3, lsr #16        @ r3 = rl.low
    orr     r3, r3, \ru, lsl #16
    str     r3, [r0], #208*2
    mov     r3, \ru,lsr #16
    mov     r3, r3, lsl #16        @ r3 = ru.high << 16
    orr     r3, r3, \rl, lsr #16
    str     r3, [r0], #208*2
.endm


.global vidConvCpyM2_16_90 @ void *to, void *from, int width

@ void vidConvCpyM2_16_90(void *to, void *from, int width)
@
@ In:  r0 = to, r1 = from (both word-aligned),
@      r2 = width in 8-pixel groups (each inner iteration consumes
@           4 words = 8 pixels from each of the two source lines)
@ The first 8 source pixels of every line are skipped (left border).
@ Columns are filled starting at byte offset 206*2 of the destination
@ line and moving towards offset 0.
@ width <= 0 returns immediately.
@ Clobbers: r0-r3, r12, flags
vidConvCpyM2_16_90:
    cmp     r2,  #1
    bxlt    lr                     @ nothing to copy

    stmfd   sp!, {r4-r11,lr}

    sub     r2,  r2, #1
    mov     r12, #0x00670000
    orr     r12, r12, r2, lsl #24
    orr     r12, r12, r2           @ r12 == ((208-2)/2 << 16) | ((width-1)<<24) | (width-1)

    mov     r2,  r0                @ r2 = dst base (width is not needed any more)
    add     r0,  r0, #206*2        @ first column pair
    add     r1,  r1, #8*2          @ skip left border
    add     lr,  r1, #328*2        @ lr = same position on the next source line

.loopM2_16_90:
    subs    r12, r12, #1<<24       @ flags survive until the bpl (macros set none)

    ldmia   r1!, {r4-r7}
    ldmia   lr!, {r8-r11}
    rot_str16_90 r4 r8
    rot_str16_90 r5 r9
    rot_str16_90 r6 r10
    rot_str16_90 r7 r11

    bpl     .loopM2_16_90

    add     r12, r12, #1<<24       @ undo the final decrement (top byte back to 0)
    subs    r12, r12, #0x00010000  @ one line pair done
    bmi     .loopM2_16_90_end

    add     r0,  r2,  r12, lsr #14 @ new dst pointer: base + pairs_left*4
    orr     r12, r12, r12, lsl #24 @ restore the width counter

    @ skip remaining pixels on these 2 lines
    mov     r4, #328/8-1           @ width of mode2 in line_pixels/8
    sub     r4, r4, r12, lsr #24
    add     r1, lr, r4,  lsl #4    @ skip src pixels
    add     lr, r1, #328*2
    b       .loopM2_16_90

.loopM2_16_90_end:
    ldmfd   sp!, {r4-r11,lr}
    bx      lr



@ rot_str16_270 ru rl   (for right-to-left copies)
@
@ ru holds 2 pixels from the upper source line, rl the 2 pixels below them.
@ Writes two destination words, one destination line apart:
@   first : (rl.high << 16) | ru.high
@   second: (rl.low  << 16) | ru.low
@ r0 is post-incremented by 208*2 after each store. Clobbers r3 only.
.macro rot_str16_270 ru rl
    mov     r3, \rl,lsr #16
    mov     r3, r3, lsl #16        @ r3 = rl.high << 16
    orr     r3, r3, \ru, lsr #16
    str     r3, [r0], #208*2
    mov     r3, \ru,lsl #16
    mov     r3, r3, lsr #16        @ r3 = ru.low
    orr     r3, r3, \rl, lsl #16
    str     r3, [r0], #208*2
.endm


.global vidConvCpyM2_16_270 @ void *to, void *from, int width

@ void vidConvCpyM2_16_270(void *to, void *from, int width)
@
@ In:  r0 = to, r1 = from (both word-aligned),
@      r2 = width in 8-pixel groups
@ Source lines are read backwards from the end of each 328-pixel line.
@ Columns are filled starting at byte offset 0 of the destination line
@ and moving towards higher offsets.
@ width <= 0 returns immediately.
@ Clobbers: r0-r3, r12, flags
vidConvCpyM2_16_270:
    cmp     r2,  #1
    bxlt    lr                     @ nothing to copy

    stmfd   sp!, {r4-r11,lr}

    sub     r2,  r2, #1
    mov     r12, #0x00670000
    orr     r12, r12, r2, lsl #24
    orr     r12, r12, r2           @ r12 == ((208-2)/2 << 16) | ((width-1)<<24) | (width-1)

    add     r1,  r1, #328*2        @ skip left border+1line
    add     lr,  r1, #328*2
    add     r2,  r0, #206*2        @ r2 = dst base, adjusted for the algorithm

.loopM2_16_270:
    subs    r12, r12, #1<<24       @ flags survive until the bpl (macros set none)

    ldmdb   r1!, {r4-r7}
    ldmdb   lr!, {r8-r11}
    rot_str16_270 r7 r11           @ update the screen in incrementing direction, reduces tearing slightly
    rot_str16_270 r6 r10
    rot_str16_270 r5 r9
    rot_str16_270 r4 r8

    bpl     .loopM2_16_270

    add     r12, r12, #1<<24       @ undo the final decrement (top byte back to 0)
    subs    r12, r12, #0x00010000  @ one line pair done
    bmi     .loopM2_16_270_end

    sub     r0,  r2,  r12, lsr #14 @ new dst pointer: base - pairs_left*4
    orr     r12, r12, r12, lsl #24 @ restore the width counter

    @ skip remaining pixels on these 2 lines
    mov     r4, #328/8-1           @ width of mode2 in line_pixels/8
    sub     r4, r4, r12, lsr #24
    sub     r1, lr, r4,  lsl #4    @ skip src pixels
    add     r1, r1, #328*2*2
    add     lr, r1, #328*2
    b       .loopM2_16_270

.loopM2_16_270_end:
    ldmfd   sp!, {r4-r11,lr}
    bx      lr
 | |
| 
 | |
| 
 | |
| 
 | |
.global vidConvCpyM2_RGB32_90 @ void *to, void *from, int width

@ void vidConvCpyM2_RGB32_90(void *to, void *from, int width)
@
@ Rotating copy with 16-bit -> 32-bit expansion (see convRGB32_2).
@ Each pass walks up one 2-pixel-wide source column, starting on source
@ line 207 just past the 8-pixel left border, and fills two 208-pixel
@ destination lines: low pixels through r0, high pixels through r8.
@
@ In:  r0 = to, r1 = from (both word-aligned),
@      r2 = width; width*4 two-pixel columns are processed
@ width <= 0 returns immediately (the column counter would otherwise wrap).
@ Clobbers: r0-r3, r12, flags
vidConvCpyM2_RGB32_90:
    cmp     r2,  #1
    bxlt    lr                     @ nothing to copy

    stmfd   sp!, {r4-r10,lr}

    mov     lr, #0x00F00000
    orr     lr, lr, #0x00F0        @ lr == pattern 0x00F000F0

    mov     r12, #208/4            @ row counter (4 source lines per iteration)
    mov     r10, r2, lsl #2        @ we do 2 pixel wide copies

    add     r8,  r0, #208*4        @ parallel line
    add     r1,  r1, #0x21000
    add     r1,  r1, #0x00280      @ r1+=328*207*2+8*2
    mov     r9,  r1                @ r9 remembers the start of the current column

.loopM2RGB32_90:
    subs    r12, r12, #1           @ flags survive until the bne (macros set none)

    @ at first this loop was written differently: src pixels were fetched with ldm's and
    @ dest was not sequential. It ran nearly 2 times slower. It seems it is very important
    @ to do sequential memory access on those items, which we have more (to offload addressing bus?).

    ldr     r4, [r1], #-328*2
    ldr     r5, [r1], #-328*2
    ldr     r6, [r1], #-328*2
    ldr     r7, [r1], #-328*2

    convRGB32_2 r4, 1              @ low pixel -> [r0], high pixel stays in the register
    convRGB32_2 r5, 1
    convRGB32_2 r6, 1
    convRGB32_2 r7, 1

    str     r4, [r8], #4
    str     r5, [r8], #4
    str     r6, [r8], #4
    str     r7, [r8], #4

    bne     .loopM2RGB32_90

    subs    r10, r10, #1
    beq     .loopM2RGB32_90_end

    mov     r12, #208/4             @ restore row counter
    mov     r0,  r8                 @ set new dst pointer
    add     r8,  r0,  #208*4
    add     r9,  r9,  #2*2          @ fix src pointer
    mov     r1,  r9
    b       .loopM2RGB32_90

.loopM2RGB32_90_end:
    @ return through bx like the other routines in this file, so the return
    @ also works for a Thumb caller on cores where ldm-to-pc does not interwork
    ldmfd   sp!, {r4-r10,lr}
    bx      lr
 | |
| 
 | |
| 
 | |
| 
 | |
@ convRGB32_3 rin
@
@ Converter for vidConvCpyM2_RGB32_270. Expands both 16-bit pixels of \rin
@ (0000bbb0 ggg0rrr0 each) to 32-bit 00r0g0b0 pixels:
@   - the HIGH pixel is stored to [r0], post-incrementing r0 by 4
@   - the LOW pixel is returned in \rin
@
@ Requires: lr = 0x00F000F0
@ Clobbers: r2, r3, \rin (condition flags are left untouched)
.macro convRGB32_3 rin
    and     r3,  \rin, lr                 @ green nibbles of both pixels
    and     r2,  lr, \rin, lsr #4         @ blue nibbles of both pixels
    orr     r2,  r2,   r3, lsl #8         @ g0b0g0b0

    and     \rin,lr,  \rin, ror #12       @ 00r000r0 (reversed: low pixel's red on top)
    mov     r3,  r2,  lsl #16             @ g0b00000
    orr     r3,  r3,  \rin, lsr #16       @ g0b000r0 (low pixel, halves swapped)

    mov     r2,  r2,  lsr #16             @ 0000g0b0 of the high pixel
    orr     r2,  r2,  \rin, lsl #16       @ r2 = high pixel, 00r0g0b0
    str     r2, [r0], #4

    mov     \rin,r3,  ror #16             @ \rin = low pixel, 00r0g0b0
.endm
 | |
| 
 | |
| 
 | |
.global vidConvCpyM2_RGB32_270 @ void *to, void *from, int width

@ void vidConvCpyM2_RGB32_270(void *to, void *from, int width)
@
@ Rotating copy with 16-bit -> 32-bit expansion (see convRGB32_3).
@ Each pass walks down one 2-pixel-wide source column, starting with the
@ last two pixels of the first 328-pixel source line and moving left, and
@ fills two 208-pixel destination lines: high pixels through r0, low
@ pixels through r8.
@
@ In:  r0 = to, r1 = from (both word-aligned),
@      r2 = width; width*4 two-pixel columns are processed
@ width <= 0 returns immediately (the column counter would otherwise wrap).
@ Clobbers: r0-r3, r12, flags
vidConvCpyM2_RGB32_270:
    cmp     r2,  #1
    bxlt    lr                     @ nothing to copy

    stmfd   sp!, {r4-r10,lr}

    mov     lr, #0x00F00000
    orr     lr, lr, #0x00F0        @ lr == pattern 0x00F000F0

    mov     r12, #208/4            @ row counter (4 source lines per iteration)
    mov     r10, r2, lsl #2        @ we do 2 pixel wide copies (right to left)

    add     r8,  r0, #208*4        @ parallel line
    add     r1,  r1, #326*2        @ last two pixels of the first source line
    mov     r9,  r1                @ r9 remembers the start of the current column

.loopM2RGB32_270:
    subs    r12, r12, #1           @ flags survive until the bne (macros set none)

    ldr     r4, [r1], #328*2
    ldr     r5, [r1], #328*2
    ldr     r6, [r1], #328*2
    ldr     r7, [r1], #328*2

    convRGB32_3 r4                 @ high pixel -> [r0], low pixel stays in the register
    convRGB32_3 r5
    convRGB32_3 r6
    convRGB32_3 r7

    str     r4, [r8], #4
    str     r5, [r8], #4
    str     r6, [r8], #4
    str     r7, [r8], #4

    bne     .loopM2RGB32_270

    subs    r10, r10, #1
    beq     .loopM2RGB32_270_end

    mov     r12, #208/4             @ restore row counter
    mov     r0,  r8                 @ set new dst pointer
    add     r8,  r0,  #208*4
    sub     r9,  r9,  #2*2          @ fix src pointer
    mov     r1,  r9
    b       .loopM2RGB32_270

.loopM2RGB32_270_end:
    @ return through bx like the other routines in this file, so the return
    @ also works for a Thumb caller on cores where ldm-to-pc does not interwork
    ldmfd   sp!, {r4-r10,lr}
    bx      lr
 | |
| 
 | 
