# Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
#
# This is free software, licensed under the GNU General Public License v2.
# See /LICENSE for more information.
# Scratch area for intermediate downloads; overridable through the TMP_DIR
# environment variable (falls back to /tmp when unset or empty).
TMPDIR = os.environ.get('TMP_DIR') or '/tmp'
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
class PathException(Exception):
    """Raised by the Path helpers on filesystem/tar layout errors."""
class DownloadGitHubError(Exception):
    """Raised when the GitHub download/repack pipeline fails."""
34 """Context class for preparing and cleaning up directories.
36 If ```preclean` is ``False``, ``path`` will NOT be removed on context enter
38 If ``path`` ``isdir``, then it will be created on context enter.
40 If ``keep`` is True, then ``path`` will NOT be removed on context exit
43 def __init__(self, path, isdir=True, preclean=False, keep=False):
46 self.preclean = preclean
51 self.rm_all(self.path)
53 self.mkdir_all(self.path)
56 def __exit__(self, exc_type, exc_value, traceback):
58 self.rm_all(self.path)
62 """Same as mkdir -p."""
63 names = os.path.split(path)
66 p = os.path.join(p, name)
71 names = Path._listdir(dir_)
73 p = os.path.join(dir_, name)
79 Path._os_func(os.mkdir, path, errno.EEXIST)
83 Path._os_func(os.rmdir, path, errno.ENOENT)
87 Path._os_func(os.remove, path, errno.ENOENT)
91 return Path._os_func(os.listdir, path, errno.ENOENT, default=[])
94 def _os_func(func, path, errno, default=None):
95 """Call func(path) in an idempotent way.
97 On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
98 return ``default``, otherwise, re-raise
111 if os.path.islink(path):
113 elif os.path.isdir(path):
114 Path._rmdir_dir(path)
119 def untar(path, into=None):
120 """Extract tarball at ``path`` into subdir ``into``.
122 return subdir name if and only if there exists one, otherwise raise PathException
124 args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
125 subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
126 dirs = os.listdir(into)
130 raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))
133 def tar(path, subdir, into=None, ts=None):
134 """Pack ``path`` into tarball ``into``."""
135 # --sort=name requires a recent build of GNU tar
136 args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
137 args += ['-C', path, '-cf', into, subdir]
138 envs = os.environ.copy()
140 args.append('--mtime=@%d' % ts)
141 if into.endswith('.xz'):
142 envs['XZ_OPT'] = '-7e'
144 elif into.endswith('.bz2'):
146 elif into.endswith('.gz'):
150 raise PathException('unknown compression type %s' % into)
151 subprocess.check_call(args, env=envs)
class GitHubCommitTsCache(object):
    """Small file-backed cache mapping commit-API URL -> commit timestamp."""

    __cachef = 'github.commit.ts.cache'
    __cachen = 2048  # max entries retained on flush

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}
164 """Get timestamp with key ``k``."""
165 fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT)
166 with os.fdopen(fileno) as fin:
168 fcntl.lockf(fileno, fcntl.LOCK_SH)
169 self._cache_init(fin)
171 ts = self.cache[k][0]
174 fcntl.lockf(fileno, fcntl.LOCK_UN)
178 """Update timestamp with ``k``."""
179 fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT)
180 with os.fdopen(fileno, 'w+') as f:
182 fcntl.lockf(fileno, fcntl.LOCK_EX)
184 self.cache[k] = (v, int(time.time()))
187 fcntl.lockf(fileno, fcntl.LOCK_UN)
189 def _cache_init(self, fin):
191 k, ts, updated = line.split()
193 updated = int(updated)
194 self.cache[k] = (ts, updated)
196 def _cache_flush(self, fout):
197 cache = sorted(self.cache.items(), key=lambda a: a[1][1])
198 cache = cache[:self.__cachen]
200 os.ftruncate(fout.fileno(), 0)
201 fout.seek(0, os.SEEK_SET)
205 line = '{0} {1} {2}\n'.format(k, ts, updated)
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

    - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
      This affects fetching commit date for reproducible tarballs.  Download
      through the archive link is not affected.

    - GitHub archives do not contain source code for submodules.

    - GitHub archives seem to respect .gitattributes and ignore paths with
      export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """

    # Accepts https:// and git:// GitHub URLs; repo may carry a .git suffix.
    __repo_url_regex = re.compile(r'^(?:https|git)://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')
237 def __init__(self, args):
238 self.dl_dir = args.dl_dir
239 self.version = args.version
240 self.subdir = args.subdir
241 self.source = args.source
243 self._init_owner_repo()
244 self.xhash = args.hash
246 self.commit_ts = None # lazy load commit timestamp
247 self.commit_ts_cache = GitHubCommitTsCache()
248 self.name = 'github-tarball'
251 """Download and repack GitHub archive tarball."""
252 self._init_commit_ts()
253 with Path(TMPDIR_DL, keep=True) as dir_dl:
254 # fetch tarball from GitHub
255 tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
256 with Path(tarball_path, isdir=False):
257 self._fetch(tarball_path)
259 d = os.path.join(dir_dl.path, self.subdir + '.untar')
260 with Path(d, preclean=True) as dir_untar:
261 tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
262 dir0 = os.path.join(dir_untar.path, tarball_prefix)
263 dir1 = os.path.join(dir_untar.path, self.subdir)
265 if self._has_submodule(dir0):
266 raise self._error('Fetching submodules is not yet supported')
268 os.rename(dir0, dir1)
270 into=os.path.join(TMPDIR_DL, self.source)
271 Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
273 self._hash_check(into)
277 # move to target location
278 file1 = os.path.join(self.dl_dir, self.source)
280 shutil.move(into, file1)
282 def _has_submodule(self, dir_):
283 m = os.path.join(dir_, '.gitmodules')
286 return st.st_size > 0
288 return e.errno != errno.ENOENT
290 def _init_owner_repo(self):
291 m = self.__repo_url_regex.search(self.url)
293 raise self._error('Invalid github url: {}'.format(self.url))
294 owner = m.group('owner')
295 repo = m.group('repo')
296 if repo.endswith('.git'):
301 def _init_hasher(self):
304 self.hasher = hashlib.sha256()
305 elif len(xhash) == 32:
306 self.hasher = hashlib.md5()
308 raise self._error('Requires sha256sum for verification')
311 def _hash_check(self, f):
312 with open(f, 'rb') as fin:
317 self.hasher.update(d)
318 xhash = self.hasher.hexdigest()
319 if xhash != self.xhash:
320 raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))
322 def _init_commit_ts(self):
323 if self.commit_ts is not None:
325 # GitHub provides 2 APIs[1,2] for fetching commit data. API[1] is more
326 # terse while API[2] provides more verbose info such as commit diff
327 # etc. That's the main reason why API[1] is preferred: the response
328 # size is predictable.
330 # However, API[1] only accepts complete commit sha1sum as the parameter
331 # while API[2] is more liberal accepting also partial commit id and
334 # [1] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
335 # [2] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
338 'url': self._make_repo_url_path('git', 'commits', self.version),
339 'attr_path': ('committer', 'date'),
341 'url': self._make_repo_url_path('commits', self.version),
342 'attr_path': ('commit', 'committer', 'date'),
345 version_is_sha1sum = len(self.version) == 40
346 if not version_is_sha1sum:
347 apis.insert(0, apis.pop())
351 attr_path = api['attr_path']
353 ct = self.commit_ts_cache.get(url)
357 ct = self._init_commit_ts_remote_get(url, attr_path)
359 self.commit_ts_cache.set(url, ct)
361 except Exception as e:
362 reasons += '\n' + (" {}: {}".format(url, e))
363 raise self._error('Cannot fetch commit ts:{}'.format(reasons))
365 def _init_commit_ts_remote_get(self, url, attrpath):
366 resp = self._make_request(url)
368 date = json.loads(data)
369 for attr in attrpath:
371 date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
372 date = date.timetuple()
373 ct = calendar.timegm(date)
376 def _fetch(self, path):
377 """Fetch tarball of the specified version ref."""
379 url = self._make_repo_url_path('tarball', ref)
380 resp = self._make_request(url)
381 with open(path, 'wb') as fout:
388 def _make_repo_url_path(self, *args):
389 url = '/repos/{0}/{1}'.format(self.owner, self.repo)
391 url += '/' + '/'.join(args)
394 def _make_request(self, path):
395 """Request GitHub API endpoint on ``path``."""
396 url = 'https://api.github.com' + path
398 'Accept': 'application/vnd.github.v3+json',
399 'User-Agent': 'OpenWrt',
401 req = urllib.request.Request(url, headers=headers)
402 sslcontext = ssl._create_unverified_context()
403 fileobj = urllib.request.urlopen(req, context=sslcontext)
406 def _error(self, msg):
407 return DownloadGitHubError('{}: {}'.format(self.source, msg))
def main():
    """CLI entry point: parse arguments and run the GitHub tarball download."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dl-dir', default=os.getcwd(), help='Download dir')
    parser.add_argument('--url', help='Download URL')
    parser.add_argument('--subdir', help='Source code subdir name')
    parser.add_argument('--version', help='Source code version')
    parser.add_argument('--source', help='Source tarball filename')
    parser.add_argument('--hash', help='Source tarball\'s expected sha256sum')
    args = parser.parse_args()
    try:
        method = DownloadGitHubTarball(args)
        method.download()
    except Exception as ex:
        # fail loudly so the Makefile caller can fall back to clone-then-pack
        sys.stderr.write('{}: Download from {} failed\n'.format(args.source, args.url))
        sys.stderr.write('{}\n'.format(ex))
        sys.exit(1)
if __name__ == '__main__':
    main()