Wed Jan 14, 2015 4:12 am
jonnection wrote:I operate on bytes at a time using shifts and masks. This means that all my drawing routines draw bytes at a time ie. there is no putpixel. It also means that I have different routines for horizontal and vertical orientation and that clipping with screen boundaries is more complicated.
#include <SPI.h>
#include <Gamebuino.h>
Gamebuino gb;
const byte sprite[] PROGMEM = {
16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};
const byte swizzled_sprite[] PROGMEM = {
16, 16,
0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
};
void setup(){
gb.begin();
}
void loop(){
long start, finish;
int drawTime;
if(gb.update()){
const int count = 200;
start = millis();
for (int i=0; i<count; i++)
gb.display.drawBitmap(1, 31, sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("drawBitmap: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
start = millis();
for (int i=0; i<count; i++)
drawBitmapUnrolled(17, 31, sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("unrolled: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
start = millis();
for (int i=0; i<count; i++)
drawBitmapSwizzled(33, 31, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("swizzled: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
start = millis();
for (int i=0; i<count; i++)
drawBitmapSwizzledAligned(49, 32, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("aligned: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
}
}
void drawBitmapUnrolled(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2; //add an offset to the pointer to start after the width and height
int8_t i, j, byteWidth = (w + 7) >> 3;
uint8_t * screen_line = gb.display.getBuffer() + x;
const uint8_t * bitmap_line = bitmap;
uint8_t dsty = y;
uint8_t mask = _BV(y & 7);
for (j = 0; j < h; j++, dsty++, bitmap_line+=byteWidth, mask = (mask & 0x80) ? 1 : (mask<<1))
{
int ofs = (dsty >> 3) * LCDWIDTH_NOROT;
int8_t dstx = x;
const uint8_t * bitmap_src = bitmap_line;
uint8_t * ptr = screen_line + ofs;
int8_t i = w;
while (i >= 8)
{
uint8_t pixels = pgm_read_byte(bitmap_src++);
if (pixels & 0x80)
ptr[0] |= mask;
if (pixels & 0x40)
ptr[1] |= mask;
if (pixels & 0x20)
ptr[2] |= mask;
if (pixels & 0x10)
ptr[3] |= mask;
if (pixels & 0x08)
ptr[4] |= mask;
if (pixels & 0x04)
ptr[5] |= mask;
if (pixels & 0x02)
ptr[6] |= mask;
if (pixels & 0x01)
ptr[7] |= mask;
ptr += 8;
i -= 8;
dstx += 8;
}
if (i)
{
uint8_t pixels = pgm_read_byte(bitmap_src);
while (i--)
{
if (pixels & 0x80)
gb.display.drawPixel(dstx, dsty);
pixels <<= 1;
}
}
}
}
void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
unsigned scale1 = _BV((y & 7) + 8);
unsigned scale2 = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
{
uint8_t * dst = line;
for (int x=0; x<w; x++, dst++)
{
uint8_t pixels = pgm_read_byte(bitmap++);
*dst |= (pixels * scale1) >> 8;
*(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
}
}
}
void drawBitmapSwizzledAligned(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
unsigned scale1 = _BV((y & 7) + 8);
unsigned scale2 = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT-w)
for (int x=0; x<w; x+=2, line+=2, bitmap+=2)
*(unsigned *)line |= pgm_read_word(bitmap);
}
Wed Jan 14, 2015 9:01 am
Wed Jan 14, 2015 7:01 pm
jonnection wrote:155 ns ... well done !
Your code for the "swizzled" version is almost identical to the inner loop of mine.
The problem with the swizzled approach is that you need to write all graphics primitives to support it right from the beginning. Its not a case of replacing putpixel in a loop.
My take-away of this investigation so far:
Given that I easily get >20 alpha masked sprites on screen with my swizzled algorithm in "Isle of Maniax", I believe, that with your optimization of the putpixel routine as presented in this thread (the ~350 ns) version, the Gamebuino lib should be plenty fast enough for most games. Using putpixel in the inner loop makes many other things so much easier that the tradeoff vs. swizzled approach is maybe not worth rewriting the whole graphics system. But that's just my opinion.
I'm debating with myself whether I'll start hand optimizing the routines in asm but probably not. I'm happy with the performance now that the shifts have been eliminated, and I'm not willing to give up the portability of C++. I had never ever heard about how crappy shifting was on AVR before this thread came up. I'm surprised at how little mention it gets eg. on Arduino forums.
I also tried to find several approaches to improve the shifting performance (jumptables & nibble swap and mask) but came to the conclusion that multiplication is indeed faster no matter how you do it. Setting up the jump takes so many cycles that you can't beat the mul. The AVR has no hardware for shifting, end of story.
Graphics algorithms are just so much fun !
Wed Jan 14, 2015 7:52 pm
#include <SPI.h>
#include <Gamebuino.h>
Gamebuino gb;
const byte sprite[] PROGMEM = {
16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};
const byte swizzled_sprite[] PROGMEM = {
16, 16,
0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
};
void setup(){
Serial.begin(9600);
gb.begin();
}
void loop(){
long start, finish;
int drawTime;
if(gb.update()){
const int count = 200;
start = millis();
for (int i=0; i<count; i++)
drawBitmapSwizzled(33, 31, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
//gb.display.print(F("swizzled: "));
//gb.display.print(drawTime);
//gb.display.println(F("ns"));
Serial.print(F("Swizzled: "));
Serial.print(drawTime);
Serial.println(F(" ns"));
delay(1000);
start = millis();
for (int i=0; i<count; i++)
drawBitmapJSwizz(33, 31, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
//gb.display.print(F("aligned: "));
//gb.display.print(drawTime);
//gb.display.println(F("ns"));
Serial.print(F("JSwizz: "));
Serial.print(drawTime);
Serial.println(F(" ns"));
delay(1000);
}
}
void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
unsigned scale1 = _BV((y & 7) + 8);
unsigned scale2 = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
{
uint8_t * dst = line;
for (int x=0; x<w; x++, dst++)
{
uint8_t pixels = pgm_read_byte(bitmap++);
*dst |= (pixels * scale1) >> 8;
*(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
}
}
}
void drawBitmapJSwizz(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + (y >> 1) * 21 + x;
uint8_t mult = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
{
uint8_t * dst = line;
uint8_t x=w;
do {
uint16_t pixels = pgm_read_byte(bitmap++);
pixels *= mult;
*dst |= pixels;
*(dst+LCDWIDTH_NOROT) |= pixels >> 8;
dst++;
} while (--x);
}
}
Wed Jan 14, 2015 8:18 pm
jonnection wrote:115 ns, non-aligned swizzled.
Wed Jan 14, 2015 8:35 pm
Jamish wrote:Dang. If I may ask, what's swizzled? Is that rearranging the bytes/bits/ordering of the original sprite so that it's faster to operate on the bytes?
Wed Jan 14, 2015 9:16 pm
jonnection wrote:115 ns, non-aligned swizzled.
Wed Jan 14, 2015 9:29 pm
Jamish wrote:Dang. If I may ask, what's swizzled? Is that rearranging the bytes/bits/ordering of the original sprite so that it's faster to operate on the bytes?
Thu Jan 15, 2015 1:35 am
uint8_t * line = gb.display.getBuffer() + (y >> 1) * 21 + x;
#include <SPI.h>
#include <Gamebuino.h>
Gamebuino gb;
const byte sprite[] PROGMEM = {
16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};
const byte swizzled_sprite[] PROGMEM = {
16, 16,
0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
};
void setup(){
Serial.begin(9600);
gb.begin();
}
void loop(){
long start, finish;
int drawTime;
if(gb.update()){
const int count = 1000;
start = millis();
for (int i=0; i<count; i++)
drawBitmapJSwizz(0, 31, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("JSwizz: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
Serial.print(F("JSwizz: "));
Serial.print(drawTime);
Serial.println(F(" ns"));
start = millis();
for (int i=0; i<count; i++)
drawBitmapJSwizz2(16, 31, swizzled_sprite);
finish = millis();
drawTime = 1000L*(finish-start)/count;
gb.display.print(F("JSwizz2: "));
gb.display.print(drawTime);
gb.display.println(F("ns"));
Serial.print(F("JSwizz2: "));
Serial.print(drawTime);
Serial.println(F(" ns"));
delay(1000);
}
}
void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
unsigned scale1 = _BV((y & 7) + 8);
unsigned scale2 = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
{
uint8_t * dst = line;
for (int x=0; x<w; x++, dst++)
{
uint8_t pixels = pgm_read_byte(bitmap++);
*dst |= (pixels * scale1) >> 8;
*(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
}
}
}
void drawBitmapJSwizz(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + ((y >> 1) & 0xfc) * 21 + x;
uint8_t mult = _BV((y & 7));
for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
{
uint8_t * dst = line;
uint8_t x=w;
do {
uint16_t pixels = pgm_read_byte(bitmap++);
pixels *= mult;
*dst |= pixels;
*(dst+LCDWIDTH_NOROT) |= pixels >> 8;
dst++;
} while (--x);
}
}
void drawBitmapJSwizz2(int8_t x, int8_t y, const uint8_t *bitmap) {
int8_t w = pgm_read_byte(bitmap);
int8_t h = pgm_read_byte(bitmap + 1);
bitmap = bitmap + 2;
uint8_t * line = gb.display.getBuffer() + ((y >> 1) & 0xfc) * 21 + x;
uint8_t mult = _BV((y & 7));
uint8_t l = h >> 3;
do
{
uint8_t * dst = line;
line += w;
do
{
uint16_t pixels = pgm_read_byte(bitmap++) * mult;
*dst |= pixels;
*(dst + LCDWIDTH_NOROT) |= pixels >> 8;
} while ((uint8_t)(uint16_t)(++dst) != (uint8_t)(uint16_t)line);
line += LCDWIDTH_NOROT - w;
} while (--l);
}
Thu Jan 15, 2015 2:43 am