Display functions optimization

Libraries, utilities, bootloaders...

Re: Display functions optimization

Postby Myndale » Wed Jan 14, 2015 4:12 am

jonnection wrote:I operate on bytes at a time using shifts and masks. This means that all my drawing routines draw bytes at a time ie. there is no putpixel. It also means that I have different routines for horizontal and vertical orientation and that clipping with screen boundaries is more complicated.


That sounds similar to what I was discussing regarding storing bitmaps in the same format as the LCD buffer. Doing that in my little demo gets drawBitmap down to ~155ns (~14 times faster than the library version), aligning y to a multiple of 8 gets it to 50ns (43x faster). And that's without any loop unrolling or hand-tweaked assembly:

Code: Select all
#include <SPI.h>
#include <Gamebuino.h>
Gamebuino gb;

const byte sprite[] PROGMEM = {
  16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};
 
  const byte swizzled_sprite[] PROGMEM = {
  16, 16,
  0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
  0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
};


void setup(){
  gb.begin();
}

void loop(){
  long start, finish;
  int drawTime;

  if(gb.update()){
    const int count = 200;

     start = millis();
     for (int i=0; i<count; i++)
     gb.display.drawBitmap(1, 31, sprite);
     finish = millis();
     drawTime = 1000L*(finish-start)/count;
     gb.display.print(F("drawBitmap: "));
     gb.display.print(drawTime);
     gb.display.println(F("ns"));

    start = millis();
    for (int i=0; i<count; i++)
      drawBitmapUnrolled(17, 31, sprite);
    finish = millis();
    drawTime = 1000L*(finish-start)/count;
    gb.display.print(F("unrolled: "));
    gb.display.print(drawTime);
    gb.display.println(F("ns"));
   
    start = millis();
    for (int i=0; i<count; i++)
      drawBitmapSwizzled(33, 31, swizzled_sprite);
    finish = millis();
    drawTime = 1000L*(finish-start)/count;
    gb.display.print(F("swizzled: "));
    gb.display.print(drawTime);
    gb.display.println(F("ns"));
   
    start = millis();
    for (int i=0; i<count; i++)
      drawBitmapSwizzledAligned(49, 32, swizzled_sprite);
    finish = millis();
    drawTime = 1000L*(finish-start)/count;
    gb.display.print(F("aligned: "));
    gb.display.print(drawTime);
    gb.display.println(F("ns"));
  }
}

void drawBitmapUnrolled(int8_t x, int8_t y, const uint8_t *bitmap) {

  int8_t w = pgm_read_byte(bitmap);
  int8_t h = pgm_read_byte(bitmap + 1);
  bitmap = bitmap + 2; //add an offset to the pointer to start after the width and height
  int8_t i, j, byteWidth = (w + 7) >> 3;
  uint8_t * screen_line = gb.display.getBuffer() + x;
  const uint8_t * bitmap_line = bitmap;

  uint8_t dsty = y;
  uint8_t mask = _BV(y & 7);
  for (j = 0; j < h; j++, dsty++, bitmap_line+=byteWidth, mask = (mask & 0x80) ? 1 : (mask<<1))
  {
    int ofs = (dsty >> 3) * LCDWIDTH_NOROT;
    int8_t dstx = x;
    const uint8_t * bitmap_src = bitmap_line;
    uint8_t * ptr = screen_line + ofs;   
    int8_t i = w;
    while (i >= 8)
    {
      uint8_t pixels = pgm_read_byte(bitmap_src++);
      if (pixels & 0x80)
        ptr[0] |= mask;
      if (pixels & 0x40)
        ptr[1] |= mask;
      if (pixels & 0x20)
        ptr[2] |= mask;
      if (pixels & 0x10)
        ptr[3] |= mask;
      if (pixels & 0x08)
        ptr[4] |= mask;
      if (pixels & 0x04)
        ptr[5] |= mask;
      if (pixels & 0x02)
        ptr[6] |= mask;
      if (pixels & 0x01)
        ptr[7] |= mask;
       
      ptr += 8;
      i -= 8;
      dstx += 8;     
    }
    if (i)
    {
       uint8_t pixels = pgm_read_byte(bitmap_src);
       while (i--)
       {
         if (pixels & 0x80)
         gb.display.drawPixel(dstx, dsty);
         pixels <<= 1;
       }
     }     
  }

}

void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {

  int8_t w = pgm_read_byte(bitmap);
  int8_t h = pgm_read_byte(bitmap + 1);
  bitmap = bitmap + 2;
  uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
  unsigned scale1 = _BV((y & 7) + 8);
  unsigned scale2 = _BV((y & 7));
  for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
  {
    uint8_t * dst = line;
    for (int x=0; x<w; x++, dst++)
    {
      uint8_t pixels = pgm_read_byte(bitmap++);
      *dst |= (pixels * scale1) >> 8;
      *(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
    }
  }
}

void drawBitmapSwizzledAligned(int8_t x, int8_t y, const uint8_t *bitmap) {

  int8_t w = pgm_read_byte(bitmap);
  int8_t h = pgm_read_byte(bitmap + 1);
  bitmap = bitmap + 2;
  uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
  unsigned scale1 = _BV((y & 7) + 8);
  unsigned scale2 = _BV((y & 7));
  for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT-w)
    for (int x=0; x<w; x+=2, line+=2, bitmap+=2)
      *(unsigned *)line |= pgm_read_word(bitmap);
}
Myndale
 
Posts: 507
Joined: Sat Mar 01, 2014 1:25 am

Re: Display functions optimization

Postby jonnection » Wed Jan 14, 2015 9:01 am

155 ns ... well done !

Your code for the "swizzled" version is almost identical to the inner loop of mine.

The problem with the swizzled approach is that you need to write all graphics primitives to support it right from the beginning. Its not a case of replacing putpixel in a loop.

My take-away of this investigation so far:

Given that I easily get >20 alpha masked sprites on screen with my swizzled algorithm in "Isle of Maniax", I believe, that with your optimization of the putpixel routine as presented in this thread (the ~350 ns) version, the Gamebuino lib should be plenty fast enough for most games. Using putpixel in the inner loop makes many other things so much easier that the tradeoff vs. swizzled approach is maybe not worth rewriting the whole graphics system. But that's just my opinion.

I'm debating with myself whether I'll start hand optimizing the routines in asm but probably not. I'm happy with the performance now that the shifts have been eliminated, and I'm not willing to give up the portability of C++. I had never ever heard about how crappy shifting was on AVR before this thread came up. I'm surprised at how little mention it gets eg. on Arduino forums.

I also tried to find several approaches to improve the shifting performance (jumptables & nibble swap and mask) but came to the conclusion that multiplication is indeed faster no matter how you do it. Setting up the jump takes so many cycles that you can't beat the mul. The AVR has no hardware for shifting, end of story.

Graphics algorithms are just so much fun !
User avatar
jonnection
 
Posts: 317
Joined: Sun May 04, 2014 8:21 pm

Re: Display functions optimization

Postby DFX2KX » Wed Jan 14, 2015 7:01 pm

jonnection wrote:155 ns ... well done !

Your code for the "swizzled" version is almost identical to the inner loop of mine.

The problem with the swizzled approach is that you need to write all graphics primitives to support it right from the beginning. Its not a case of replacing putpixel in a loop.

My take-away of this investigation so far:

Given that I easily get >20 alpha masked sprites on screen with my swizzled algorithm in "Isle of Maniax", I believe, that with your optimization of the putpixel routine as presented in this thread (the ~350 ns) version, the Gamebuino lib should be plenty fast enough for most games. Using putpixel in the inner loop makes many other things so much easier that the tradeoff vs. swizzled approach is maybe not worth rewriting the whole graphics system. But that's just my opinion.

I'm debating with myself whether I'll start hand optimizing the routines in asm but probably not. I'm happy with the performance now that the shifts have been eliminated, and I'm not willing to give up the portability of C++. I had never ever heard about how crappy shifting was on AVR before this thread came up. I'm surprised at how little mention it gets eg. on Arduino forums.

I also tried to find several approaches to improve the shifting performance (jumptables & nibble swap and mask) but came to the conclusion that multiplication is indeed faster no matter how you do it. Setting up the jump takes so many cycles that you can't beat the mul. The AVR has no hardware for shifting, end of story.

Graphics algorithms are just so much fun !


I'm just impressed at how much you can cram onto the GB's screen without slowing down as it is... I don't think any of my games (sans perhaps the tank game demo) would even be noticeably faster, the Gamebuino already outruns me.
DFX2KX
 
Posts: 250
Joined: Mon Apr 14, 2014 3:48 am

Re: Display functions optimization

Postby jonnection » Wed Jan 14, 2015 7:52 pm

115 ns, non-aligned swizzled. :lol:

Image

Code: Select all
    #include <SPI.h>
    #include <Gamebuino.h>
    Gamebuino gb;

    const byte sprite[] PROGMEM = {
      16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};
     
      const byte swizzled_sprite[] PROGMEM = {
      16, 16,
      0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
      0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
    };


    void setup(){
      Serial.begin(9600);
      gb.begin();
    }

    void loop(){
      long start, finish;
      int drawTime;

      if(gb.update()){
        const int count = 200;

       
        start = millis();
        for (int i=0; i<count; i++)
          drawBitmapSwizzled(33, 31, swizzled_sprite);
        finish = millis();
        drawTime = 1000L*(finish-start)/count;
        //gb.display.print(F("swizzled: "));
        //gb.display.print(drawTime);
        //gb.display.println(F("ns"));
        Serial.print(F("Swizzled: "));
        Serial.print(drawTime);
        Serial.println(F(" ns"));
       
        delay(1000);
       
        start = millis();
        for (int i=0; i<count; i++)
          drawBitmapJSwizz(33, 31, swizzled_sprite);
        finish = millis();
        drawTime = 1000L*(finish-start)/count;
        //gb.display.print(F("aligned: "));
        //gb.display.print(drawTime);
        //gb.display.println(F("ns"));
        Serial.print(F("JSwizz: "));
        Serial.print(drawTime);
        Serial.println(F(" ns"));
       
        delay(1000);
      }
    }

    void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {

      int8_t w = pgm_read_byte(bitmap);
      int8_t h = pgm_read_byte(bitmap + 1);
      bitmap = bitmap + 2;
      uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
      unsigned scale1 = _BV((y & 7) + 8);
      unsigned scale2 = _BV((y & 7));
      for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
      {
        uint8_t * dst = line;
        for (int x=0; x<w; x++, dst++)
        {
          uint8_t pixels = pgm_read_byte(bitmap++);
          *dst |= (pixels * scale1) >> 8;
          *(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
        }
      }
    }

    void drawBitmapJSwizz(int8_t x, int8_t y, const uint8_t *bitmap) {

      int8_t w = pgm_read_byte(bitmap);
      int8_t h = pgm_read_byte(bitmap + 1);
      bitmap = bitmap + 2;
      uint8_t * line = gb.display.getBuffer() + (y >> 1) * 21 + x;
      uint8_t mult = _BV((y & 7));
      for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
      {
        uint8_t * dst = line;
        uint8_t x=w;
        do {
          uint16_t pixels = pgm_read_byte(bitmap++);
          pixels *= mult;
          *dst |= pixels;
          *(dst+LCDWIDTH_NOROT) |= pixels >> 8;
          dst++;
        } while (--x);
      }
    }

User avatar
jonnection
 
Posts: 317
Joined: Sun May 04, 2014 8:21 pm

Re: Display functions optimization

Postby Jamish » Wed Jan 14, 2015 8:18 pm

jonnection wrote:115 ns, non-aligned swizzled. :lol:


Image

Dang. If I may ask, what's swizzled? Is that rearranging the bytes/bits/ordering of the original sprite so that it's faster to operate on the bytes?
User avatar
Jamish
 
Posts: 73
Joined: Wed Dec 17, 2014 6:52 pm
Location: California

Re: Display functions optimization

Postby jonnection » Wed Jan 14, 2015 8:35 pm

Jamish wrote:Dang. If I may ask, what's swizzled? Is that rearranging the bytes/bits/ordering of the original sprite so that it's faster to operate on the bytes?


I don't know. Myndale called it swizzling and since he is a graphics guru, I thought its something important.

How this works is that, in case the bitmap doesn't fall on a byte boundary in the pixel memory (i.e. y is not 0,8,16 etc) you "split" the bitmap byte in two pieces. One piece you draw (OR) with the byte ( 8 pixels ) on the screen, and the other piece you OR with the byte below (8 pixels). In this way you do not need to check individual bits because you OR with complete bytes.
User avatar
jonnection
 
Posts: 317
Joined: Sun May 04, 2014 8:21 pm

Re: Display functions optimization

Postby Myndale » Wed Jan 14, 2015 9:16 pm

jonnection wrote:115 ns, non-aligned swizzled. :lol:


Wow! Impressive. :o

I gotta go have a look at the assembly to this now....

(EDIT: Rodot we seem to have lost the "Surprised" smiley....)
Myndale
 
Posts: 507
Joined: Sat Mar 01, 2014 1:25 am

Re: Display functions optimization

Postby Myndale » Wed Jan 14, 2015 9:29 pm

Jamish wrote:Dang. If I may ask, what's swizzled? Is that rearranging the bytes/bits/ordering of the original sprite so that it's faster to operate on the bytes?


Yep, that's it exactly. It's used a lot in computer graphics, for example textures on consoles like the XBOX or PSP etc have a swizzled/fast format for faster rendering of triangles. The technical reasoning behind it is that textures can only be sampled from fast cache memory, which there isn't a whole lot of due to the high cost. If a texture needs to be sampled and the cache line that the texel resides in isn't already sitting in the cache then the whole graphics pipeline stalls while that line is fetched from video RAM. To get good performance you therefore need to ensure that subsequent texel fetches are likely to be close to each other in order to minimize the number of cache misses. Swizzled textures are laid out differently in order to facilitate this.

In a very crude way that's sort of what's happening here. Each byte in the screen buffer is a tiny "cache line" of 8 bits which must be loaded into a "cache slot" (register), modified and then written back out again. By storing the bitmaps in the same format as the screen buffer we can write multiple pixels at a time and minimize the number of instructions and amount of traffic we need to transfer across the bus.
Myndale
 
Posts: 507
Joined: Sat Mar 01, 2014 1:25 am

Re: Display functions optimization

Postby Myndale » Thu Jan 15, 2015 1:35 am

82ns :D

Image

There was a minor bug in this line that was causing the render address to be wrong..

Code: Select all
uint8_t * line = gb.display.getBuffer() + (y >> 1) * 21 + x;


...but the fix doesn't affect the speed any (note how both sprites were being rendered to the same coordinate but appeared in different places on screen). Here's my latest effort:

Code: Select all
#include <SPI.h>
#include <Gamebuino.h>
Gamebuino gb;

const byte sprite[] PROGMEM = {
  16, 16, 0x1f,0xf8,0x1f,0xf8,0x1f,0xfc,0x1f,0xff,0x1f,0xff,0xf,0xff,0xf,0xff,0x7,0xff,0x87,0xff,0x3,0xff,0x1,0xff,0x0,0x7f,0x2,0x1f,0x0,0x0,0x0,0x0,0x40,0x0,};

const byte swizzled_sprite[] PROGMEM = {
  16, 16,
  0x00,0x00,0x00,0x1f,0x7f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xfc,0xf8,0xf8,
  0x01,0x80,0x00,0x00,0x00,0x01,0x13,0x07,0x07,0x0f,0x0f,0x1f,0x1f,0x1f,0x1f,0x1f
};


void setup(){
  Serial.begin(9600);
  gb.begin();
}

void loop(){
  long start, finish;
  int drawTime;

  if(gb.update()){
    const int count = 1000;


    start = millis();
    for (int i=0; i<count; i++)
      drawBitmapJSwizz(0, 31, swizzled_sprite);
    finish = millis();
    drawTime = 1000L*(finish-start)/count;
    gb.display.print(F("JSwizz: "));
    gb.display.print(drawTime);
    gb.display.println(F("ns"));
    Serial.print(F("JSwizz: "));
    Serial.print(drawTime);
    Serial.println(F(" ns"));
   
    start = millis();
    for (int i=0; i<count; i++)
      drawBitmapJSwizz2(16, 31, swizzled_sprite);
    finish = millis();
    drawTime = 1000L*(finish-start)/count;
    gb.display.print(F("JSwizz2: "));
    gb.display.print(drawTime);
    gb.display.println(F("ns"));
    Serial.print(F("JSwizz2: "));
    Serial.print(drawTime);
    Serial.println(F(" ns"));

    delay(1000);
  }
}

void drawBitmapSwizzled(int8_t x, int8_t y, const uint8_t *bitmap) {

  int8_t w = pgm_read_byte(bitmap);
  int8_t h = pgm_read_byte(bitmap + 1);
  bitmap = bitmap + 2;
  uint8_t * line = gb.display.getBuffer() + (y >> 3) * LCDWIDTH_NOROT + x;
  unsigned scale1 = _BV((y & 7) + 8);
  unsigned scale2 = _BV((y & 7));
  for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
  {
    uint8_t * dst = line;
    for (int x=0; x<w; x++, dst++)
    {
      uint8_t pixels = pgm_read_byte(bitmap++);
      *dst |= (pixels * scale1) >> 8;
      *(dst+LCDWIDTH_NOROT) |= (pixels * scale2) >> 8;
    }
  }
}

void drawBitmapJSwizz(int8_t x, int8_t y, const uint8_t *bitmap) {

      int8_t w = pgm_read_byte(bitmap);
      int8_t h = pgm_read_byte(bitmap + 1);
      bitmap = bitmap + 2;
      uint8_t * line = gb.display.getBuffer() + ((y >> 1) & 0xfc) * 21 + x;
      uint8_t mult = _BV((y & 7));
      for (int y=0; y<h; y+=8, line+=LCDWIDTH_NOROT)
      {
        uint8_t * dst = line;
        uint8_t x=w;
        do {
          uint16_t pixels = pgm_read_byte(bitmap++);
          pixels *= mult;
          *dst |= pixels;
          *(dst+LCDWIDTH_NOROT) |= pixels >> 8;
          dst++;
        } while (--x);
      }
    }

void drawBitmapJSwizz2(int8_t x, int8_t y, const uint8_t *bitmap) {

  int8_t w = pgm_read_byte(bitmap);
  int8_t h = pgm_read_byte(bitmap + 1);
  bitmap = bitmap + 2;
  uint8_t * line = gb.display.getBuffer() + ((y >> 1) & 0xfc) * 21 + x; 
  uint8_t mult = _BV((y & 7));
  uint8_t l = h >> 3;
  do
  {
    uint8_t * dst = line;
    line += w;
    do
    {
      uint16_t pixels = pgm_read_byte(bitmap++) * mult;
      *dst |= pixels;
      *(dst + LCDWIDTH_NOROT) |= pixels >> 8;
    } while ((uint8_t)(uint16_t)(++dst) != (uint8_t)(uint16_t)line);
    line += LCDWIDTH_NOROT - w;
  } while (--l);
}


The fact that some simple rearranging of code can result in a 35-40% speed increase suggests we're probably close to the limit of what can be achieved using C, it's also indicative of just how crap the AVR gcc optimizer is.
Myndale
 
Posts: 507
Joined: Sat Mar 01, 2014 1:25 am

Re: Display functions optimization

Postby NicK » Thu Jan 15, 2015 2:43 am

So lets get some assembler language routines going on the gamebuino - get to the nuts and bolts.......would be great education.
NicK
 
Posts: 27
Joined: Tue Dec 23, 2014 11:25 pm

PreviousNext

Return to Software Development

Who is online

Users browsing this forum: No registered users and 6 guests

cron