tags:

views:

25

answers:

3

I am creating a pager that returns documents from an Apache CouchDB map function from python-couchdb. This generator expression is working well, until it hits the max recursion depth. How can it be improved to move to iteration, rather than recursion?

def page(db, view_name, limit, include_docs=True, **opts):
    """
    `page` goes returns all documents of CouchDB map functions. It accepts
    all options that `couchdb.Database.view` does, however `include_docs` 
    should be omitted, because this will interfere with things.

    >>> import couchdb
    >>> db = couchdb.Server()['database']
    >>> for doc in page(db, '_all_docs', 100):
    >>>    doc
    #etc etc
    >>> del db['database']

    Notes on implementation:
      - `last_doc` is assigned on every loop, because there doesn't seem to
        be an easy way to know if something is the last item in the iteration.
    """

    last_doc = None
    for row in db.view(view_name,
                     limit=limit+1,
                     include_docs=include_docs,
                     **opts):
        last_doc = row.key, row.id
        yield row.doc
    if last_doc:
        for doc in page(db, view_name, limit,
               inc_docs=inc_docs, 
               startkey=last_doc[0], 
               startkey_docid=last_doc[1]):
          yield doc
A: 

Here's something to get you started. You didn't specify what *opts might be; if you only need startkey and startkey_docid to start the recursion, and not some other fields, then you can get rid of the extra function.

Obviously, untested.

def page_key(db, view_name, limit, startkey, startkey_docid, inc_docs=True):
    queue = [(startkey, startkey_docid)]
    while queue:
        key = queue.pop()

        last_doc = None
        for row in db.view(view_name,
                           limit=limit+1,
                           include_docs=inc_docs,
                           startkey=key[0],
                           startkey_docid=key[1]):
            last_doc = row.key, row.id
            yield row.doc

        if last_doc:
            queue.append(last_doc)

def page(db, view_name, limit, inc_docs=True, **opts):
    last_doc = None
    for row in db.view(view_name,
                       limit=limit+1,
                       include_docs=inc_docs,
                       **opts):
        last_doc = row.key, row.id
        yield row.doc

    if last_doc:
        for doc in page_key(db, view_name, limit, last_doc[0], last_doc[1], inc_docs):
            yield doc
Glenn Maynard
A: 

This is an alternative approach that I've tested (manually) on a database with >800k docs. Seems to work.

 def page2(db, view_name, limit, inc_docs=True, **opts):
     def get_batch(db=db, view_name=view_name, limit=limit, inc_docs=inc_docs, **opts):
         for row in db.view(view_name, limit=limit+1, include_docs=inc_docs, **opts):
             yield row
     last_doc = None
     total_rows = db.view(view_name, limit=1).total_rows
     batches = (total_rows / limit) + 1
     for i in xrange(batches):
         if not last_doc:
             for row in get_batch():
                 last_doc = row.key, row.id
                 yield row.doc or row # if include_docs is False, 
                                      # row.doc will be None
         else:
             for row in get_batch(startkey=last_doc[0], 
                             startkey_docid=last_doc[1]):
                 last_doc = row.key, row.id
                 yield row.doc or row
Tim McNamara
A: 

I don't use CouchDB so I had a little trouble understanding the sample code. Here's a stripped down version, which I believe works the way you want:

all_docs = range(0, 100)

def view(limit, offset):
    print "view: returning", limit, "rows starting at", offset
    return all_docs[offset:offset+limit]

def generate_by_pages(page_size):
    offset = 0
    while True:
        rowcount = 0
        for row in generate_page(page_size, offset):
            rowcount += 1
            yield row
        if rowcount == 0:
            break
        else: 
            offset += rowcount

def generate_page(page_size, offset):
    for row in view(page_size, offset):
        yield row

for r in generate_by_pages(10):
    print r

The key thing is replacing recursion with iteration. There are lots of ways to do this (I like trampolining in Python) but the above is straightforward.

Bill Gribble