I'm trying to do some simple image processing using OpenGL. Since I couldn't find any good library that already does this, I've been trying to write my own solution.
I simply want to compose a few images on the GPU and then read them back. However, the performance of my implementation seems almost equal to what it takes to do on the CPU... something is wrong...
I've tried to follow the best practices I've found on the net, but it's still doing something wrong.
I've tried removing all the irrelevant code.
Any ideas as to why this implementation has poor performance?
// Dimensions, in pixels, of every image handled by the code below.
int image_width = 1280;
int image_height = 720;
// Size of one image in BYTES. Pixels are transferred as GL_BGRA / GL_UNSIGNED_BYTE,
// i.e. 4 bytes each, and this value is used as the byte count for buffer
// allocation and memcpy, so it must include the per-pixel size.
int image_size = image_width * image_height * 4;
// Owns a single OpenGL 2D texture object with GL_RGBA8 storage of
// image_width x image_height texels.
class texture
{
public:
    // Generates the texture, selects nearest filtering and clamped wrapping,
    // and allocates storage without uploading any pixels.
    // The texture is left bound to GL_TEXTURE_2D.
    // NOTE: if a buffer is bound to the pixel-unpack target when this runs,
    // OpenGL treats the null data pointer as offset 0 into that buffer.
    texture()
    {
        glGenTextures(1, &texture_);
        bind();
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
        glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, image_width, image_height, 0, GL_BGRA, GL_UNSIGNED_BYTE, NULL);
    }

    // Releases the texture object.
    ~texture() { glDeleteTextures(1, &texture_); }

    // The destructor deletes the GL object, so a copy would delete it twice.
    texture(const texture&) = delete;
    texture& operator=(const texture&) = delete;

    // Binds this texture to GL_TEXTURE_2D.
    void bind() { glBindTexture(GL_TEXTURE_2D, texture_); }

    // Returns the OpenGL name of the texture.
    GLuint handle() { return texture_; }

private:
    GLuint texture_;
};
typedef std::shared_ptr<texture> texture_ptr;
// Pixel buffer object (PBO) together with the texture it uploads into.
// The same buffer object is used for uploads (bound to the pixel-unpack target)
// and for read-backs (bound to the pixel-pack target).
// Every member function releases the buffer binding it made before returning,
// so unrelated pixel transfers are not redirected into this buffer.
class pixel_buffer // pixel buffer with associated texture
{
public:
    // Creates the buffer object and allocates image_size bytes for it.
    pixel_buffer()
    {
        glGenBuffersARB(1, &pbo_);
        bind_pbo();
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        // While a buffer is bound to the unpack target, pointer arguments of
        // texture uploads are treated as offsets into it, so release it here.
        unbind_pbo();
    }

    // Releases the buffer object.
    ~pixel_buffer() { glDeleteBuffers(1, &pbo_); }

    // The destructor deletes the GL object, so a copy would delete it twice.
    pixel_buffer(const pixel_buffer&) = delete;
    pixel_buffer& operator=(const pixel_buffer&) = delete;

    // Copies image_size bytes from src into the buffer object.
    // The copy into the texture is issued later by end_write().
    void begin_write(void* src)
    {
        bind_texture();
        bind_pbo();
        // Re-specify the data store with no initial data before mapping it.
        glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
        void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
        assert(ptr);
        if(ptr) // never copy through a null mapping, even with asserts disabled
        {
            memcpy(ptr, src, image_size);
            glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
        }
        unbind_pbo();
    }

    // Issues the transfer from the buffer object (offset 0) into the texture.
    // Leaves the texture bound to GL_TEXTURE_2D.
    void end_write()
    {
        bind_texture();
        bind_pbo();
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        unbind_pbo();
    }

    // Issues a read of the whole image from the given colour buffer of the
    // currently bound framebuffer into the buffer object.
    // buffer: value passed to glReadBuffer.
    void begin_read(GLuint buffer)
    {
        glReadBuffer(buffer);
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        glBufferData(GL_PIXEL_PACK_BUFFER_ARB, image_size, NULL, GL_STREAM_READ);
        glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0));
        // Release the pack target so other read-backs issued before end_read()
        // are not written into this buffer.
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
    }

    // Copies the image_size bytes produced by begin_read() into dest.
    // Nothing is copied if the buffer cannot be mapped.
    void end_read(void* dest)
    {
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_);
        void* ptr = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
        assert(ptr);
        if(ptr)
        {
            memcpy(dest, ptr, image_size);
            glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
        }
        glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
    }

    // Binds the buffer object to the pixel-unpack target.
    void bind_pbo() { glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_); }

    // Releases the pixel-unpack target (binds buffer 0).
    void unbind_pbo() { glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); }

    // Binds the associated texture to GL_TEXTURE_2D.
    void bind_texture() { texture_.bind(); }

    // Returns the OpenGL name of the associated texture.
    GLuint texture_handle() { return texture_.handle(); }

private:
    texture texture_;
    GLuint pbo_;
};
typedef std::shared_ptr<pixel_buffer> pixel_buffer_ptr;
// Framebuffer object whose colour attachment 0 is the texture of an owned
// pixel_buffer; the same pixel_buffer is used to read the rendered image back.
class frame_buffer// frame buffer with associated pixel buffer
{
public:
    // Creates the framebuffer object and attaches the pixel buffer's texture
    // as colour attachment 0.
    frame_buffer()
    {
        glGenFramebuffersEXT(1, &fbo_);
        bind();
        pbo_.bind_texture();
        glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_2D, pbo_.texture_handle(), 0);
        // Construction should not redirect subsequent rendering into this
        // framebuffer; callers bind() explicitly when they want that.
        unbind();
    }

    // Releases the framebuffer object.
    ~frame_buffer() { glDeleteFramebuffersEXT(1, &fbo_); }

    // The destructor deletes the GL object, so a copy would delete it twice.
    frame_buffer(const frame_buffer&) = delete;
    frame_buffer& operator=(const frame_buffer&) = delete;

    // Makes this framebuffer the current render/read target.
    void bind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, fbo_); }

    // Restores framebuffer 0 as the current target.
    void unbind() { glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0); }

    // Binds this framebuffer and issues a read of colour attachment 0 into the
    // pixel buffer. The framebuffer stays bound until end_read().
    void begin_read()
    {
        bind();
        pbo_.begin_read(GL_COLOR_ATTACHMENT0_EXT);
    }

    // Copies the image read by begin_read() into dest, then unbinds the framebuffer.
    void end_read(void* dest)
    {
        pbo_.end_read(dest);
        unbind();
    }

private:
    pixel_buffer pbo_;
    GLuint fbo_;
};
typedef std::shared_ptr<frame_buffer> frame_buffer_ptr;
// Composes images in three stages that overlap across successive compose() calls:
// upload started in call N is drawn in call N+1, and the image drawn in call
// N+1 is read back in call N+2.
struct image_processor::implementation
{
    // Advances every stage by one step.
    // images: the images whose upload should be started by this call.
    void compose(const std::vector<image_ptr>& images)
    {
        // END PREVIOUS READ
        if(reading_fbo_)
        {
            image_ptr result_image = std::make_shared<image>(image_size);
            reading_fbo_->end_read(result_image->data());
            // Publish the image that was just filled by the read-back.
            output_.push(result_image);
            reading_fbo_ = nullptr; // hands the framebuffer back to the pool
        }

        // END PREVIOUS WRITE
        frame_buffer_ptr written_fbo;
        if(!writing_pbo_group_.empty())
        {
            written_fbo = get_fbo();
            written_fbo->bind();
            glClear(GL_COLOR_BUFFER_BIT);
            for(const auto& pbo : writing_pbo_group_)
            {
                pbo->end_write();
                pbo->bind_texture();
                quad_->draw(); // DRAW FULLSCREEN QUAD
            }
            written_fbo->unbind();
            writing_pbo_group_.clear(); // hands the pixel buffers back to the pool
        }

        // BEGIN NEW WRITE
        for(const auto& img : images)
        {
            auto pbo = get_pbo();
            pbo->begin_write(img->data());
            writing_pbo_group_.push_back(pbo);
        }

        // BEGIN NEW READ
        if(written_fbo)
        {
            written_fbo->begin_read();
            reading_fbo_ = written_fbo;
        }
    }

    // Takes a pixel buffer from the pool, creating one if the pool is empty.
    // The returned handle does not destroy the buffer when released; its
    // deleter puts the buffer back into the pool. The deleter captures `this`,
    // so handles must not outlive this object.
    pixel_buffer_ptr get_pbo()
    {
        if(pbo_pool_.empty())
            pbo_pool_.push_back(std::make_shared<pixel_buffer>());
        auto pbo = pbo_pool_.front();
        pbo_pool_.pop_front();
        return pixel_buffer_ptr(pbo.get(), [this, pbo](pixel_buffer*){ pbo_pool_.push_back(pbo); });
    }

    // Takes a framebuffer from the pool, creating one if the pool is empty.
    // Same recycling and lifetime rules as get_pbo().
    frame_buffer_ptr get_fbo()
    {
        if(fbo_pool_.empty())
            fbo_pool_.push_back(std::make_shared<frame_buffer>());
        auto fbo = fbo_pool_.front();
        fbo_pool_.pop_front();
        return frame_buffer_ptr(fbo.get(), [this, fbo](frame_buffer*){ fbo_pool_.push_back(fbo); });
    }

    // The pools are declared BEFORE the handles below. Members are destroyed in
    // reverse declaration order, and releasing a handle pushes into its pool,
    // so the pools have to be destroyed last.
    std::deque<pixel_buffer_ptr> pbo_pool_;
    std::deque<frame_buffer_ptr> fbo_pool_;

    std::vector<pixel_buffer_ptr> writing_pbo_group_; // uploads started, not yet drawn
    frame_buffer_ptr reading_fbo_;                    // read-back started, not yet collected
};
EDIT:
Did some profiling. Most CPU time seems to be spent in begin_write().
I can't see anything wrong with it, though...
// Copies image_size bytes from src into the pixel buffer object.
// The buffer binding is released before returning so that later texture
// uploads are not sourced from this buffer by accident.
void begin_write(void* src)
{
    texture_.bind();
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_);
    // Re-specify the data store with no initial data before mapping it.
    glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, image_size, 0, GL_STREAM_DRAW);
    void* ptr = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
    assert(ptr);
    if(ptr) // never copy through a null mapping, even with asserts disabled
    {
        memcpy(ptr, src, image_size);
        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
    }
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
}