3 # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
5 # This is free software, licensed under the GNU General Public License v2.
6 # See /LICENSE for more information.
# Working directories: $TMP_DIR (default /tmp) with a 'dl' subdir for
# intermediate downloads.  ``or '/tmp'`` also covers a set-but-empty TMP_DIR.
TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
class PathException(Exception):
    """Raised when a path/tarball operation cannot be completed."""


class DownloadGitHubError(Exception):
    """Raised when downloading or repacking a GitHub tarball fails."""
34 """Context class for preparing and cleaning up directories.
If ``preclean`` is ``False``, ``path`` will NOT be removed on context enter
38 If ``path`` ``isdir``, then it will be created on context enter.
40 If ``keep`` is True, then ``path`` will NOT be removed on context exit
    def __init__(self, path, isdir=True, preclean=False, keep=False):
        """Record how ``path`` should be prepared and cleaned by the context.

        NOTE(review): only ``preclean`` is stored in the lines visible here;
        presumably ``path``, ``isdir`` and ``keep`` are saved as well (they
        are read by __enter__/__exit__) — confirm against the full source.
        """
        self.preclean = preclean
51 self.rm_all(self.path)
53 self.mkdir_all(self.path)
    def __exit__(self, exc_type, exc_value, traceback):
        # Remove the managed path on context exit.  Presumably guarded by
        # ``self.keep`` in the full source (elided here) so kept paths
        # survive — confirm.
        self.rm_all(self.path)
62 """Same as mkdir -p."""
63 names = os.path.split(path)
66 p = os.path.join(p, name)
71 names = Path._listdir(dir_)
73 p = os.path.join(dir_, name)
79 Path._os_func(os.mkdir, path, errno.EEXIST)
83 Path._os_func(os.rmdir, path, errno.ENOENT)
87 Path._os_func(os.remove, path, errno.ENOENT)
91 return Path._os_func(os.listdir, path, errno.ENOENT, default=[])
    # NOTE(review): the ``errno`` parameter shadows the ``errno`` module
    # inside this function body.
    def _os_func(func, path, errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
        return ``default``, otherwise, re-raise.
        """
111 if os.path.islink(path):
113 elif os.path.isdir(path):
114 Path._rmdir_dir(path)
    def untar(path, into=None):
        """Extract tarball at ``path`` into subdir ``into``.

        Return the subdir name if and only if there exists exactly one;
        otherwise raise PathException.
        """
        # -C extracts inside ``into``; --no-same-permissions plus the 0o22
        # umask yields conventional world-readable permissions.
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        # NOTE(review): the single-subdir success path (returning the sole
        # entry of ``dirs``) is elided from this view; only the failure
        # branch is visible here.
        raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` (relative to ``path``) into tarball ``into``.

        ``ts``, when given, is used as a fixed --mtime (epoch seconds) for
        reproducible output.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        # NOTE(review): presumably guarded by a ``ts is not None`` check in
        # the full source (elided here) — confirm.
        args.append('--mtime=@%d' % ts)
        # Pick compression by output suffix.  NOTE(review): the per-format
        # tar flag lines and the final ``else:`` are elided in this view.
        if into.endswith('.xz'):
            # -7e: high xz compression with the "extreme" modifier
            envs['XZ_OPT'] = '-7e'
        elif into.endswith('.bz2'):
        elif into.endswith('.gz'):
        # Unknown suffix: refuse rather than silently emit the wrong format.
        raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
class GitHubCommitTsCache(object):
    """File-backed cache mapping keys to (timestamp, last-updated) pairs."""
    # Name of the on-disk cache file; it lives inside TMPDIR_DL.
    __cachef = 'github.commit.ts.cache'
159 Path.mkdir_all(TMPDIR_DL)
160 self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
164 """Get timestamp with key ``k``."""
165 fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
166 with os.fdopen(fileno) as fin:
168 fcntl.lockf(fileno, fcntl.LOCK_SH)
169 self._cache_init(fin)
171 ts = self.cache[k][0]
174 fcntl.lockf(fileno, fcntl.LOCK_UN)
178 """Update timestamp with ``k``."""
179 fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
180 with os.fdopen(fileno, 'wb+') as f:
182 fcntl.lockf(fileno, fcntl.LOCK_EX)
184 self.cache[k] = (v, int(time.time()))
187 fcntl.lockf(fileno, fcntl.LOCK_UN)
    def _cache_init(self, fin):
        """Load cache entries from open file ``fin`` into ``self.cache``.

        Each line is ``<key> <timestamp> <updated>``.
        """
        # NOTE(review): the ``for line in fin:`` loop header is elided in
        # this view; ``line`` below comes from that elided loop.
        k, ts, updated = line.split()
        updated = int(updated)
        self.cache[k] = (ts, updated)
    def _cache_flush(self, fout):
        """Write the most recently updated cache entries back to ``fout``."""
        # Sort entries by their 'updated' field, newest first.
        # NOTE(review): ``iteritems`` and ``cmp=`` are Python-2-only; a
        # Python 3 port needs sorted(self.cache.items(),
        # key=lambda kv: kv[1][1], reverse=True).
        cache = sorted(self.cache.iteritems(), cmp=lambda a, b: b[1][1] - a[1][1])
        # Keep only the newest __cachen entries.
        cache = cache[:self.__cachen]
        # Rewrite the file from the start.
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        # NOTE(review): the loop binding k/ts/updated and the fout.write()
        # call are elided in this view.
        line = '{0} {1} {2}\n'.format(k, ts, updated)
class DownloadGitHubTarball(object):
    """Download and repack an archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

    - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
      This affects fetching commit date for reproducible tarballs.  Download
      through the archive link is not affected.

    - GitHub archives do not contain source code for submodules.

    - GitHub archives seem to respect .gitattributes and ignore paths with
      export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    the clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method.
    """

    # Accept https:// and git:// GitHub repo URLs and capture owner/repo.
    # Fix: dots in the hostname are now escaped so arbitrary characters
    # (e.g. "githubXcom") no longer match.
    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')
    def __init__(self, args):
        """Initialize from parsed command-line ``args`` (see the argparse
        options defined in main)."""
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        # NOTE(review): the ``self.url = args.url`` assignment appears to be
        # elided in this view but is required by _init_owner_repo — confirm.
        self._init_owner_repo()
        self.xhash = args.hash
        self.commit_ts = None  # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'
251 """Download and repack GitHub archive tarball."""
252 self._init_commit_ts()
253 with Path(TMPDIR_DL, keep=True) as dir_dl:
254 # fetch tarball from GitHub
255 tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
256 with Path(tarball_path, isdir=False):
257 self._fetch(tarball_path)
259 d = os.path.join(dir_dl.path, self.subdir + '.untar')
260 with Path(d, preclean=True) as dir_untar:
261 tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
262 dir0 = os.path.join(dir_untar.path, tarball_prefix)
263 dir1 = os.path.join(dir_untar.path, self.subdir)
265 if self._has_submodule(dir0):
266 raise self._error('Fetching submodules is not yet supported')
268 os.rename(dir0, dir1)
270 into=os.path.join(TMPDIR_DL, self.source)
271 Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
273 self._hash_check(into)
277 # move to target location
278 file1 = os.path.join(self.dl_dir, self.source)
280 shutil.move(into, file1)
    def _has_submodule(self, dir_):
        """Return True if ``dir_`` contains a non-empty .gitmodules file."""
        m = os.path.join(dir_, '.gitmodules')
        # NOTE(review): the os.stat(m) call and its try/except wrapper are
        # elided in this view; ``st`` and ``e`` below come from that elided
        # code (stat result and the caught OSError respectively).
        return st.st_size > 0
        return e.errno != errno.ENOENT
    def _init_owner_repo(self):
        """Parse ``self.url`` and record the GitHub owner/repo pair."""
        m = self.__repo_url_regex.search(self.url)
        # NOTE(review): the ``if m is None:`` guard line is elided in this
        # view; the raise below is its body.
        raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        # Strip a trailing '.git' from the repo name (body elided here).
        if repo.endswith('.git'):
    def _init_hasher(self):
        """Choose a hasher matching the expected digest length.

        NOTE(review): the branch heads are partly elided in this view; the
        visible structure suggests 64 hex chars -> sha256, 32 -> md5,
        otherwise refuse.
        """
        self.hasher = hashlib.sha256()
        elif len(xhash) == 32:
            # legacy md5 checksums
            self.hasher = hashlib.md5()
        raise self._error('Requires sha256sum for verification')
    def _hash_check(self, f):
        """Verify that file ``f`` hashes to the expected ``self.xhash``."""
        with open(f, 'rb') as fin:
            # NOTE(review): the chunked read loop is elided in this view;
            # ``d`` below is a chunk read from ``fin``.
            self.hasher.update(d)
        xhash = self.hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))
    def _init_commit_ts(self):
        """Resolve and memoize the committer timestamp of ``self.version``."""
        # Already resolved: nothing to do (the early return is elided here).
        if self.commit_ts is not None:
        # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc. That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        #
        # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # NOTE(review): the ``apis`` list literal wrapping the two dicts
        # below is elided in this view.
        'url': self._make_repo_url_path('git', 'commits', self.version),
        'attr_path': ('committer', 'date'),
        'url': self._make_repo_url_path('commits', self.version),
        'attr_path': ('commit', 'committer', 'date'),
        # API[1] requires a full 40-char sha1; otherwise reorder so the more
        # liberal API is tried first.
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            apis.insert(0, apis.pop())
        # NOTE(review): the loop over ``apis`` binding ``api``/``url`` and
        # the cache-hit/early-return handling are elided in this view.
        attr_path = api['attr_path']
        # Consult the on-disk cache before hitting the network.
        ct = self.commit_ts_cache.get(url)
        ct = self._init_commit_ts_remote_get(url, attr_path)
        self.commit_ts_cache.set(url, ct)
        raise self._error('Cannot fetch commit ts: {}'.format(url))
    def _init_commit_ts_remote_get(self, url, attrpath):
        """Fetch ``url`` from the GitHub API and dig out the commit date.

        The date is converted to epoch seconds via calendar.timegm, i.e.
        interpreted as UTC (the API format ends in 'Z').
        """
        resp = self._make_request(url)
        # NOTE(review): ``data = resp.read()`` is elided in this view.
        date = json.loads(data)
        # Walk down the JSON object along attrpath
        # (the ``date = date[attr]`` line is elided in this view).
        for attr in attrpath:
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        ct = calendar.timegm(date)
    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        # NOTE(review): the ``ref`` assignment is elided in this view.
        url = self._make_repo_url_path('tarball', ref)
        resp = self._make_request(url)
        # Stream the response body to ``path`` (copy loop elided here).
        with open(path, 'wb') as fout:
    def _make_repo_url_path(self, *args):
        """Build an API path '/repos/<owner>/<repo>[/<args>...]'."""
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        # NOTE(review): presumably guarded by ``if args:`` and followed by
        # ``return url`` in the full source (both elided here) — confirm.
        url += '/' + '/'.join(args)
    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        # NOTE(review): the ``headers = {``/``}`` dict delimiters are elided
        # in this view.
        'Accept': 'application/vnd.github.v3+json',
        'User-Agent': 'OpenWrt',
        req = urllib2.Request(url, headers=headers)
        # SECURITY NOTE(review): TLS certificate verification is disabled
        # here, so a man-in-the-middle could serve arbitrary responses.
        # Tarball content is separately checked by _hash_check, but
        # commit-timestamp API responses are not — consider a verified
        # context.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib2.urlopen(req, context=sslcontext)
405 def _error(self, msg):
406 return DownloadGitHubError('{}: {}'.format(self.source, msg))
# NOTE(review): the enclosing ``def main():`` line, the ``try:`` around the
# download call, and the final ``main()`` invocation are elided in this view.
parser = argparse.ArgumentParser()
parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
parser.add_argument('--url', help='Download URL')
parser.add_argument('--subdir', help='Source code subdir name')
parser.add_argument('--version', help='Source code version')
parser.add_argument('--source', help='Source tarball filename')
parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
args = parser.parse_args()
method = DownloadGitHubTarball(args)
except Exception as ex:
    # Best-effort failure report to stderr; exit handling elided here.
    sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
    sys.stderr.write('{}\n'.format(ex))

if __name__ == '__main__':