tags:

views:

139

answers:

2

I'm trying to do some simple image processing using opengl. Since I couldn't find any good library that does this alrdy I've been trying to do my own solution.

I simply want to compose a few images on the gpu and then read them back. However the performance of my implementation seems almost equal to what it takes do on the cpu... something is wrong...

I've tried to follow the best practices I've found on the net. But still it's doing something wrong.

I've tried removing all the irrelevant code.

Any ideas as to why this implementation has poor performance?

int image_width = 1280;
int image_height = 720;
int image_size = image_width * image_height;

class texture
{
public:
    texture()
    {   
        glGenTextures(1, &texture_);    
        bind();
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    }

    ~texture(){ glDeleteTextures(1, &texture_); }
    void bind(){ glBindTexture(GL_TEXTURE_2D, texture_); }
    GLuint handle() { return texture_; }

private:
    GLuint texture_;
};
typedef std::shared_ptr<texture> texture_ptr;

class pixel_buffer // pixel buffer with associated texture
{
public:
    pixel_buffer()
    {
        glGenBuffersARB(1, &pbo_);
        bind_pbo();
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    }

    ~pixel_buffer(){ glDeleteBuffers(1, &pbo_); }

    void begin_write(void* src)
    {
        texture_.bind();
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
        assert(ptr); 
        memcpy(ptr, src, image_size);
        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
    }

    void end_write()
    {
        bind_texture();
        bind_pbo();
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        unbind_pbo();
    }

    void begin_read(GLuint buffer)
    {
        glReadBuffer(buffer);
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        glBufferData(GL_PIXEL_PACK_BUFFER_ARB, image_size, NULL, GL_STREAM_READ);
        glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
    }

    void end_read(void* dest)
    {
        void* ptr = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);   
        memcpy(dest, ptr, image_size);
        glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
        unbind_pbo();
    }

    void bind_pbo(){ glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }
    void unbind_pbo(){ glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }
    void bind_texture() { texture_.bind(); }

    GLuint texture_handle() { return texture_.handle(); }

private:
    texture texture_;
    GLuint pbo_;
};
typedef std::shared_ptr<pixel_buffer> pixel_buffer_ptr;

class frame_buffer// frame buffer with associated pixel buffer
{
public:
    frame_buffer()
    {
        glGenFramebuffersEXT(1, &fbo_);

        bind();
        pbo_.bind_texture();
        glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, pbo_.texture_handle(), 0);
    }

    ~frame_buffer() { glDeleteFramebuffersEXT(1, &fbo_); }

    void bind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_); }
    void unbind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);    }

    void begin_read()
    {
        bind();
        pbo_.begin_read(GL_COLOR_ATTACHMENT0_EXT);
    }

    void end_read(void* dest)
    {
        pbo_.end_read(dest);
        unbind();
    }

private:
    pixel_buffer pbo_;
    GLuint fbo_;
};
typedef std::shared_ptr<frame_buffer> frame_buffer_ptr;

struct image_processor::implementation
{   
    void compose(const std::vector<image_ptr>& images)
    {                   
        // END PREVIOUS READ
        if(reading_fbo_)
        {
            image_ptr result_image = std::make_shared<image>(image_size);
            reading_fbo_->end_read(result_image->data());
            output_.push(reading_result_image_);

            reading_fbo_ = nullptr;
        }

        // END PREVIOUS WRITE
        frame_buffer_ptr written_fbo;
        if(!writing_pbo_group_.empty())
        {
            // END
            written_fbo = get_fbo();
            written_fbo->bind();
            glClear(GL_COLOR_BUFFER_BIT);   

            for(size_t n = 0; n < writing_pbo_group_.size(); ++n)
            {
                writing_pbo_group_[n]->end_write();
                writing_pbo_group_[n]->bind_texture();
                quad_->draw(); // DRAW FULLSCREEN QUAD
            }
            written_fbo->unbind();

            writing_pbo_group_.clear();
        }

        // BEGIN NEW WRITE
        if(!images.empty())
        {           
            for(size_t n = 0; n < images.size(); ++n)
            {
                auto pbo = get_pbo();
                pbo->begin_write(images[n]->data());

                writing_pbo_group_.push_back(pbo);
            }
        }

        // BEGIN NEW READ       
        if(written_fbo)
        {   
            written_fbo->begin_read();

            reading_fbo_ = written_fbo; 
        }
    }

    pixel_buffer_ptr get_pbo()
    {
        if(pbo_pool_.empty())
            pbo_pool_.push_back(std::make_shared<pixel_buffer>());

        auto pbo = pbo_pool_.front();
        pbo_pool_.pop_front();

        return pixel_buffer_ptr(pbo.get(), [=](pixel_buffer*){pbo_pool_.push_back(pbo);});
    }

    frame_buffer_ptr get_fbo()
    {
        if(fbo_pool_.empty())
            fbo_pool_.push_back(std::make_shared<frame_buffer>());

        auto fbo = fbo_pool_.front();
        fbo_pool_.pop_front();

        return frame_buffer_ptr(fbo.get(), [=](frame_buffer*){fbo_pool_.push_back(fbo);});
    }

    std::vector<pixel_buffer_ptr>   writing_pbo_group_;
    frame_buffer_ptr                reading_fbo_;

    std::deque<pixel_buffer_ptr> pbo_pool_;
    std::deque<frame_buffer_ptr> fbo_pool_;
};

EDIT:

Did some profiling. Most cpu time seems to be spent in begin_write();

Can't see anything wrong with it though...

void begin_write(void* src)
{
    texture_.bind();
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
    assert(ptr); 
    memcpy(ptr, src, image_size);
    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
}
+1  A: 

Your bottleneck is the memory transfer from the CPU to the GPU. GPU allows you to boost performance when you need to do many parallel calculations on the same small amount of data that is stored in the GPU memory (for example draw the same scene 30 frames per second). When you need to transfer data back and forth from the main memory and GPU memory, most of the time is wasted on exactly that.

ybungalobill
I believe your analysis is correct. My question then is how to reduce the cpu -> gpu transfer overhead.
ronag
By storing more data and generating whatever possible directly on the GPU. That depends on your application. Not every algorithm is appropriate to be run on the GPU. If all you want is to blend N big images once and forget about them, then GPU is not for you. If you have N images and then you apply some complex composition while the user keeps changing the parameters (e.g. image editing software) then you can load them to the GPU once and use many times.
ybungalobill
What if i use an pbo as back store for the images right away? Thus eliminating the need for an extra copy. Is writing to glMapped memory slower than regular memory?
ronag
There is no magic here. The gl*Buffer API just allows you to allocate memory on the GPU. By calling glMapBuffer you just request the drivers to assign an address for that memory in your process address space. When you write there the OS/Driver transfers it to the GPU. The performance of this depends on the bandwidth of the PCI bus (or whatever way the GPU is connected).
ybungalobill
A: 

Try altering the pixelformat (i.e. BGRA instead of ARGB).

I'm not sure of the specifics, but a mismatch in what the card actually uses and what you want could produce disastrous performance.

(It's hard to help more without info on your dataset, the algorithms you're running and some hard benchmark data...)

Marcus Lindblom
I'm using BGRA. If tried googling for opengl native texture format without success. What is the native format?
ronag
Oh. Your code said GL_RGBA8 in one place. The native format depends on the card you're using (OpenGL doesn't have one per-se, it's implementation dependent). Best way is to benchmark, to see if it's any different.
Marcus Lindblom