Mon Aug 10, 2015 12:31 am
C:/arduino/hardware/tools/avr/bin/avr-objdump -h -S C:\Users\YOUR_USER_NAME_\AppData\Local\Temp\build12345678_or_something.tmp/yourapp.cpp.elf > yourapp.lst
// Run the ultraDraw3 call with interrupts disabled, then re-enable them.
noInterrupts();
ultraDraw3(face, x, y);
interrupts();
3be: f8 94 cli
3c0: 4d 2f mov r20, r29
3c2: 6c 2f mov r22, r28
3c4: 8a e0 ldi r24, 0x0A ; 10
3c6: 91 e0 ldi r25, 0x01 ; 1
3c8: 0e 94 93 01 call 0x326 ; 0x326 <_Z10ultraDraw3Phcc>
3cc: 78 94 sei
394: 8d 91 ld r24, X+ ; load data[i] into r24
396: 90 e0 ldi r25, 0x00 ; zero-extend to 16 bits
398: 02 2e mov r0, r18 ; load r0 with (8-y%8)
39a: 02 c0 rjmp .+4 ; jump ahead to the loop test at 3a0 (skips both shift instructions), so the shift runs exactly r0 times and not at all when r0 is 0
39c: 95 95 asr r25 ; shift r25:r24 right by one bit
39e: 87 95 ror r24
3a0: 0a 94 dec r0 ; we done with the loop yet
3a2: e2 f7 brpl .-8 ; if not then keep going
// rol_lut[n] == 1 << n, so multiplying a byte by rol_lut[n] shifts it left by n bits.
const static byte rol_lut[] = {1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7}; // store this somewhere globally
....
// OR the column, shifted left by (y & 7) bits, into the buffer byte.
buf[0] |= *data * rol_lut[y & 7]; // do this once per pixel
// ror_lut[n] == 0x100 >> n, so for a byte v, (v * ror_lut[n]) >> 8 equals v >> n.
const static unsigned ror_lut[] = {0x100>>0, 0x100>>1, 0x100>>2, 0x100>>3, 0x100>>4, 0x100>>5, 0x100>>6, 0x100>>7};
....
// NOTE(review): this evaluates to data[0] >> (y & 7), whereas the annotated
// disassembly above shifts right by (8 - y%8) -- confirm which one is intended.
buf[84+0] |= (data[0] * ror_lut[y & 7]) >> 8; // do this once per pixel
// OR an 8-column sprite into the display buffer at column x, pixel row y.
//
// data : 8 bytes, one per sprite column.
// x    : column of the sprite's left edge.
// y    : pixel row of the sprite's top edge; nothing is drawn when y < 0.
//
// The buffer is addressed as rows of 84 bytes, each byte covering 8 pixel rows:
// ((y & 0xF8) >> 1) * 21 == (y / 8) * 84. Each column byte is shifted left by
// (y & 7); the low byte of the 16-bit result is OR-ed into row y/8 and the
// overflow (high byte) into the row below, the same split ultraDraw4 performs
// with its rol/rol loop. The shift is done as a multiply by 1 << (y & 7).
//
// The original loaded both scale factors from ror_lut (0x100 >> n): the 8-bit
// one truncated to 0 for y & 7 == 0 and the pair shifted by the mirrored
// amount, so the sprite landed on the wrong pixel row.
//
// No clipping is performed: buf[0..7] and buf[84..91] are always read and
// written, so the caller must keep x and y such that both rows are inside the
// buffer.
void ultraDraw5(register byte * data, const char x, register char y) {
    // bit_lut[n] == 1 << n; a table lookup avoids a variable-count shift.
    static const uint8_t bit_lut[] = {1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};

    if(y>=0) {
        uint8_t* buf = ((y&0xF8)>>1) * 21 + gb.display.getBuffer() + x;
        const uint8_t scale = bit_lut[y & 7];
        uint16_t col;   // column shifted into place: low byte = upper row, high byte = lower row

        // Kept unrolled on purpose: this routine exists for speed.
        col = data[0] * scale; buf[0] |= (uint8_t)col; buf[84+0] |= (uint8_t)(col >> 8);
        col = data[1] * scale; buf[1] |= (uint8_t)col; buf[84+1] |= (uint8_t)(col >> 8);
        col = data[2] * scale; buf[2] |= (uint8_t)col; buf[84+2] |= (uint8_t)(col >> 8);
        col = data[3] * scale; buf[3] |= (uint8_t)col; buf[84+3] |= (uint8_t)(col >> 8);
        col = data[4] * scale; buf[4] |= (uint8_t)col; buf[84+4] |= (uint8_t)(col >> 8);
        col = data[5] * scale; buf[5] |= (uint8_t)col; buf[84+5] |= (uint8_t)(col >> 8);
        col = data[6] * scale; buf[6] |= (uint8_t)col; buf[84+6] |= (uint8_t)(col >> 8);
        col = data[7] * scale; buf[7] |= (uint8_t)col; buf[84+7] |= (uint8_t)(col >> 8);
    }
}
Mon Aug 10, 2015 8:18 am
// XOR an 8-column sprite into the display buffer at column x, pixel row y,
// clipping against columns 0..83 and pixel rows 0..47.
//
// data : 8 bytes, one per sprite column (only read).
// x, y : position of the sprite's top-left corner; may be partly off-screen.
//
// Each column is shifted left by (y & 7) into a 16-bit value; the low byte is
// XOR-ed into the buffer row containing y, the high byte into the row below.
//
// Changes versus the original inline assembly:
//  * numeric local labels (0: .. 6:) instead of named ones, so the block can
//    be emitted more than once without "symbol already defined" errors;
//  * x and y use the "d" constraint (r16..r31) because CPI only accepts upper
//    registers ("register number above 15 required");
//  * the three pointers and x are declared read-write ("+") since the code
//    advances them, and "memory"/"cc" are listed as clobbered;
//  * pointers are advanced with ADIW instead of LDI/ADD/ADC.
void ultraDraw4(byte data[], char x, char y){
    // Column x in the buffer row BELOW the sprite's top row:
    // (((y+8)&0xF8)>>1)*21 == ((y+8)/8)*84 for every y that is actually drawn.
    uint8_t* buf = (((y+8)&0xF8)>>1) * 21 + x + gb.display.getBuffer();
    uint8_t* top = buf - 84;            // same column, one buffer row up
    asm volatile(
        "mov R20,%[y]\n\t"
        "ldi R17,7\n\t"
        "add R20,R17\n\t"               // R20 = y + 7
        "brmi 6f\n\t"                   // negative: sprite entirely above the screen
        "cpi %[y],48\n\t"
        "brpl 6f\n\t"                   // y >= 48: sprite entirely below the screen
        "inc R20\n\t"
        "ldi R16,8\n\t"                 // R16 = columns left to process
        "andi R20,7\n\t"                // R20 = y & 7 = shift count
        "cpi R20,0\n\t"
        "breq 4f\n"                     // no shift needed: single-row path
        "0:\n\t"                        // ---- unaligned path, one column per pass ----
        "tst %[x]\n\t"
        "brmi 3f\n\t"                   // column left of the screen
        "cpi %[x],84\n\t"
        "brcc 3f\n\t"                   // column right of the screen
        "ld R17,Z\n\t"                  // R17 = sprite column
        "eor R18,R18\n\t"               // R18 = overflow byte
        "mov R19,R20\n\t"
        "clc\n\t"
        "1:\n\t"                        // R18:R17 <<= 1; carry stays 0: CLC first, then R18's cleared bit 7 (DEC leaves carry alone)
        "rol R17\n\t"
        "rol R18\n\t"
        "dec R19\n\t"
        "brne 1b\n\t"
        "tst %[y]\n\t"
        "brmi 2f\n\t"                   // y < 0: upper row is above the screen
        "ld R19,X\n\t"
        "eor R19,R17\n\t"
        "st X,R19\n\t"
        "2:\n\t"
        "cpi %[y],40\n\t"
        "brpl 3f\n\t"                   // y >= 40: lower row is below the screen
        "ld R19,Y\n\t"
        "eor R19,R18\n\t"
        "st Y,R19\n\t"
        "3:\n\t"                        // advance all three pointers and x
        "adiw R26,1\n\t"
        "adiw R28,1\n\t"
        "adiw R30,1\n\t"
        "inc %[x]\n\t"
        "dec R16\n\t"
        "brne 0b\n\t"
        "rjmp 6f\n"
        "4:\n\t"                        // ---- aligned path: only the upper row ----
        "tst %[x]\n\t"
        "brmi 5f\n\t"
        "cpi %[x],84\n\t"
        "brcc 5f\n\t"
        "ld R17,Z\n\t"
        "ld R18,X\n\t"
        "eor R18,R17\n\t"
        "st X,R18\n\t"
        "5:\n\t"
        "adiw R26,1\n\t"
        "adiw R30,1\n\t"
        "inc %[x]\n\t"
        "dec R16\n\t"
        "brne 4b\n"
        "6:\n\t"
        : "+x" (top), "+y" (buf), "+z" (data), [x] "+d" (x)
        : [y] "d" (y)
        : "r16","r17","r18","r19","r20","cc","memory");
}
Tue Aug 11, 2015 12:39 am
Sorunome wrote:Wouldn't it be better for you to do data++ though, else it has to add always 1 to it, then 2 etc. always twice so that if you do data++ and then &(data)
3be: 23 2f mov r18, r19 ; previous *buf is in r19, "save" it to r18
3c0: 33 27 eor r19, r19 ; clear r19...umm...for what purpose, exactly?
3c2: 24 2b or r18, r20 ; "or" *data pixels with *buf pixels
3c4: 28 83 st Y, r18 ; save result back into *buf
3c6: 12 96 adiw r26, 0x02 ; advance data ptr
3c8: 3c 91 ld r19, X ; load next *buf pixel into r19. (wtf?)
Tue Aug 11, 2015 5:13 am
Tue Aug 11, 2015 8:44 am
Myndale wrote:Sorunome wrote:Wouldn't it be better for you to do data++ though, else it has to add always 1 to it, then 2 etc. always twice so that if you do data++ and then &(data)
You'd think so, wouldn't you? But this is GCC/AVR, which is a poor compiler at the best of times. In practice it generates virtually identical code in both cases except for this little gem:
- Code:
3be: 23 2f mov r18, r19 ; previous *buf is in r19, "save" it to r18
3c0: 33 27 eor r19, r19 ; clear r19...umm...for what purpose, exactly?
3c2: 24 2b or r18, r20 ; "or" *data pixels with *buf pixels
3c4: 28 83 st Y, r18 ; save result back into *buf
3c6: 12 96 adiw r26, 0x02 ; advance data ptr
3c8: 3c 91 ld r19, X ; load next *buf pixel into r19. (wtf?)
This looks very much to me like a bug in the optimizer. For some reason the compiler thinks it needs to clear r19 (which wastes 1 cycle), so it first transfers its value into r18 (which wastes another). The net effect is a total of 16 wasted cycles. You'll also notice that it advances data (r26) by 2 bytes instead of 1; if you look at the code you'll see it actually modifies the value of data in your original version a total of 4 times: twice by +2, once by -2 and once more by -1, for a net total of +1 all up. It then compensates for the pointer being out by 2 bytes by using the offset forms of LDD, which are 3 cycles instead of 1. Madness!
Wed Dec 23, 2015 4:22 pm
// Tile bitmaps stored back to back, 8 bytes per tile (13 tiles below).
// The caller indexes this table with tile_num*TILE_HEIGHT, so TILE_HEIGHT is
// presumably 8 -- confirm against its definition.
const byte tiles[]={
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, // tile 0
0x00,0x00,0x04,0x20,0x00,0x00,0x08,0x00, // tile 1
0x40,0x82,0x14,0x20,0x00,0x11,0x22,0x00, // tile 2
0x00,0x00,0x00,0x00,0x20,0x12,0x54,0x00, // tile 3
0x00,0x00,0x00,0x00,0x10,0x12,0x54,0x00, // tile 4
0xff,0xfd,0xdf,0xaf,0xfb,0xf5,0xff,0xdf, // tile 5
0xfd,0xdf,0xaf,0xfb,0xf5,0xff,0xdf,0xff, // tile 6
0x18,0x24,0x4a,0x67,0xdb,0xbf,0xdf,0x7e, // tile 7
0x7e,0x81,0xa9,0xb5,0x81,0x7e,0x18,0x18, // tile 8
0x03,0x1c,0x28,0x20,0x54,0x48,0x60,0x80, // tile 9
0xc1,0xb0,0x6a,0x57,0x3b,0x1e,0x04,0x03, // tile 10
0xc0,0x38,0x14,0x04,0x0a,0xa2,0x16,0xab, // tile 11
0x55,0x29,0x56,0xbe,0xec,0x78,0x20,0xc0 }; // tile 12
[...]
// Pass the ADDRESS of the tile's first byte: ultraDraw4 takes a pointer to the
// sprite bytes, not a single byte value. The cast drops const only because
// ultraDraw4 is declared with a non-const byte[]; it just reads through it.
ultraDraw4((byte *)&tiles[tile_num*TILE_HEIGHT], x, y);
[...]
Mon Dec 28, 2015 3:08 am
Mon Dec 28, 2015 8:35 am
Mon Dec 28, 2015 2:23 pm
Sat Jan 02, 2016 12:36 pm
Assembler messages:
Error: register number above 15 required
Error: symbol `LoopStart' is already defined
Error: register number above 15 required
Error: symbol `LoopShift' is already defined
Error: symbol `LoopSkipPart' is already defined
Error: register number above 15 required
Error: symbol `LoopSkip' is already defined
Error: symbol `LoopAligned' is already defined
Error: register number above 15 required
Error: symbol `LoopAlignSkip' is already defined
Error: symbol `End' is already defined
Error: register number above 15 required
Error: symbol `LoopStart' is already defined
Error: register number above 15 required
Error: symbol `LoopShift' is already defined
Error: symbol `LoopSkipPart' is already defined
Error: register number above 15 required
Error: symbol `LoopSkip' is already defined
Error: symbol `LoopAligned' is already defined
Error: register number above 15 required
Error: symbol `LoopAlignSkip' is already defined
Error: symbol `End' is already defined
// OR an 8-column sprite into the display buffer at column x, pixel row y,
// clipping against columns 0..83 and pixel rows 0..47.
// Routine by Sorunome.
//
// data : 8 bytes, one per sprite column (only read).
// x, y : position of the sprite's top-left corner; may be partly off-screen.
//
// Each column is shifted left by (y & 7) into a 16-bit value; the low byte is
// OR-ed into the buffer row containing y, the high byte into the row below.
//
// Fixes versus the previous version of this routine:
//  * the lower-row guard compares y against 40 again (it had become 48, which
//    is never true at that point, so for y = 41..47 the lower row was written
//    past the sixth 84-byte buffer row);
//  * the aligned path now uses OR like the unaligned path (it still used EOR);
//  * x and y use the "d" constraint (r16..r31) because CPI only accepts upper
//    registers ("register number above 15 required");
//  * the three pointers and x are declared read-write ("+") since the code
//    advances them, and "memory"/"cc" are listed as clobbered;
//  * pointers are advanced with ADIW instead of LDI/ADD/ADC.
void draw_sprite(const byte data[], char x, char y){ // routine by Sorunome
    // Column x in the buffer row BELOW the sprite's top row:
    // (((y+8)&0xF8)>>1)*21 == ((y+8)/8)*84 for every y that is actually drawn.
    uint8_t* buf = (((y+8)&0xF8)>>1) * 21 + x + gb.display.getBuffer();
    uint8_t* top = buf - 84;            // same column, one buffer row up
    asm volatile(
        "mov R20,%[y]\n\t"
        "ldi R17,7\n\t"
        "add R20,R17\n\t"               // R20 = y + 7
        "brmi 6f\n\t"                   // negative: sprite entirely above the screen
        "cpi %[y],48\n\t"
        "brpl 6f\n\t"                   // y >= 48: sprite entirely below the screen
        "inc R20\n\t"
        "ldi R16,8\n\t"                 // R16 = columns left to process
        "andi R20,7\n\t"                // R20 = y & 7 = shift count
        "cpi R20,0\n\t"
        "breq 4f\n"                     // no shift needed: single-row path
        "0:\n\t"                        // ---- unaligned path, one column per pass ----
        "tst %[x]\n\t"
        "brmi 3f\n\t"                   // column left of the screen
        "cpi %[x],84\n\t"
        "brcc 3f\n\t"                   // column right of the screen
        "ld R17,Z\n\t"                  // R17 = sprite column
        "eor R18,R18\n\t"               // R18 = overflow byte
        "mov R19,R20\n\t"
        "clc\n\t"
        "1:\n\t"                        // R18:R17 <<= 1; carry stays 0: CLC first, then R18's cleared bit 7 (DEC leaves carry alone)
        "rol R17\n\t"
        "rol R18\n\t"
        "dec R19\n\t"
        "brne 1b\n\t"
        "tst %[y]\n\t"
        "brmi 2f\n\t"                   // y < 0: upper row is above the screen
        "ld R19,X\n\t"
        "or R19,R17\n\t"
        "st X,R19\n\t"
        "2:\n\t"
        "cpi %[y],40\n\t"
        "brpl 3f\n\t"                   // y >= 40: lower row is below the screen
        "ld R19,Y\n\t"
        "or R19,R18\n\t"
        "st Y,R19\n\t"
        "3:\n\t"                        // advance all three pointers and x
        "adiw R26,1\n\t"
        "adiw R28,1\n\t"
        "adiw R30,1\n\t"
        "inc %[x]\n\t"
        "dec R16\n\t"
        "brne 0b\n\t"
        "rjmp 6f\n"
        "4:\n\t"                        // ---- aligned path: only the upper row ----
        "tst %[x]\n\t"
        "brmi 5f\n\t"
        "cpi %[x],84\n\t"
        "brcc 5f\n\t"
        "ld R17,Z\n\t"
        "ld R18,X\n\t"
        "or R18,R17\n\t"
        "st X,R18\n\t"
        "5:\n\t"
        "adiw R26,1\n\t"
        "adiw R30,1\n\t"
        "inc %[x]\n\t"
        "dec R16\n\t"
        "brne 4b\n"
        "6:\n\t"
        : "+x" (top), "+y" (buf), "+z" (data), [x] "+d" (x)
        : [y] "d" (y)
        : "r16","r17","r18","r19","r20","cc","memory");
}