Cascading and Coroutines

0 comments |

Cascading looks quite interesting. Here is a Python program that does something similar to the example in the Technical Overview; see main() in the Python program below.

    #!/usr/bin/env python
    # encoding: utf-8
    import sys

    def input(theFile, pipe):
        """
        feeds a coroutine pipe from a file, one send() per line

        every line read from theFile is passed on unchanged; once the file is
        exhausted the pipe is closed
        """
        for row in theFile:
            pipe.send(row)
        pipe.close()

    def coroutine(func):
        """
        decorator that primes a generator-based coroutine

        calls the generator function and advances the new generator to its
        first (yield), so that send() can be used on the result straight away
        """
        # NOTE: the stages below are decorated with @coroutine, but this script
        # only imports sys, so the decorator is defined here before first use.
        import functools

        @functools.wraps(func)
        def start(*args, **kwargs):
            gen = func(*args, **kwargs)
            next(gen)
            return gen
        return start

    @coroutine
    def extract(expression, pipe, group=0):
        """
        extract a group from a regex match and send it down the pipe

        expression -- regular expression searched for in every line received
        pipe       -- coroutine that receives the extracted text
        group      -- group of the match to send (0, the default, is the
                      whole match)

        lines in which the expression is not found are dropped
        """
        import re
        pattern = re.compile(expression)
        while True:
            line = (yield)
            match = pattern.search(line)
            if match:
                # send the group the caller asked for, not always group 0
                pipe.send(match.group(group))

    @coroutine
    def sort(pipe):
        """
        buffers everything it receives and, when closed, sends it all to
        pipe in ascending order
        """
        import heapq
        pending = []
        try:
            while True:
                heapq.heappush(pending, (yield))
        except GeneratorExit:
            # drain the heap smallest-first
            while len(pending) > 0:
                smallest = heapq.heappop(pending)
                pipe.send(smallest)

    @coroutine
    def group(groupPipe, pipe):
        """
        sends each run of consecutive matching lines to its own groupPipe

        groupPipe -- factory called as groupPipe(pipe) whenever a new run
                     starts; the coroutine it returns receives every line
                     of that run and is closed when the run ends
        pipe      -- handed to every groupPipe that is created; group
                     itself never sends to it or closes it
        """
        cur = None
        g = None
        try:
            while True:
                line = (yield)
                # test g, not cur, for "no run yet" so that a None item is
                # grouped like any other value
                if g is None or cur != line:
                    if g is not None:
                        g.close()
                    g = groupPipe(pipe)

                g.send(line)
                cur = line
        except GeneratorExit:
            # end the final run explicitly rather than leaving it to
            # garbage collection of the last groupPipe
            if g is not None:
                g.close()

    @coroutine
    def uniq(pipe):
        """
        implements uniq -c for a single run of lines

        counts the lines it receives and, when closed, sends one
        'count<TAB>line' record to pipe, where line is the last line
        received; it does not compare lines itself

        if it is closed before receiving any line, nothing is sent
        """
        lines = 0
        line = None
        try:
            while True:
                line = (yield)
                lines += 1
        except GeneratorExit:
            # without this guard, closing an unused uniq hit an unbound
            # 'line' and raised out of close()
            if lines:
                pipe.send('%s\t%s' % (lines, line))

    @coroutine
    def output(theFile):
        """
        writes every line it receives to theFile, adding a newline
        """
        while True:
            received = (yield)
            theFile.write(received + '\n')

    def main():
        """
        counts how often each leading run of non-space characters occurs
        in the lines of stdin and writes the counts to stdout

        the pipeline is
            stdin -> extract -> sort -> group(uniq) -> stdout
        """
        # build downstream-first: each stage is handed the stage it feeds
        sink = output(sys.stdout)
        counted = group(uniq, sink)
        ordered = sort(counted)
        fields = extract(r'^([^ ]+)', ordered)

        # input() closes only the stage it feeds directly (fields)
        input(sys.stdin, fields)

        # extract and sort do not close the stage after them, so shut the
        # rest down explicitly, upstream to downstream, instead of relying
        # on garbage collection: closing sort makes it send the buffered
        # lines in order, closing group afterwards ends the last run.
        # close() on an already finished generator is a no-op.
        ordered.close()
        counted.close()

    # run the pipeline only when executed as a script, not when imported
    if __name__ == '__main__':
        main()

You can achieve the same results with the Unix command line:

cat  access.log | cut -d ' ' -f 1 | sort | uniq -c