diff --git a/ietf/doc/management/commands/find_github_backup_info.py b/ietf/doc/management/commands/find_github_backup_info.py
index 19d95e75a..2ce36cfd5 100644
--- a/ietf/doc/management/commands/find_github_backup_info.py
+++ b/ietf/doc/management/commands/find_github_backup_info.py
@@ -1,45 +1,80 @@
 # Copyright The IETF Trust 2020, All Rights Reserved
 
+
+import github3
+
+from collections import Counter
+from urllib.parse import urlparse
+
+from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db.models import F
 
 from ietf.doc.models import DocExtResource
 from ietf.group.models import GroupExtResource
 from ietf.person.models import PersonExtResource
 
+# TODO: Think more about submodules. This currently will only take top level repos, with the assumption that the clone will include arguments to grab all the submodules.
+# As a consequence, we might end up pulling more than we need (or that the org or user expected)
+# Make sure this is what we want.
+
 
 class Command(BaseCommand):
-    help = ('Locate information about gihub repositories to backup')
+    help = ('Locate information about github repositories to backup')
 
     def handle(self, *args, **options):
+        """Print the GitHub owners and repositories named by external resources.
+
+        Collects github_repo and github_org resources from documents, groups
+        and persons, looks each owner (and each named repository) up through
+        the GitHub API, and prints a count of owners per type followed by the
+        sorted repository URLs. Owners or repositories that GitHub reports as
+        not found are skipped. If settings.GITHUB_BACKUP_API_KEY is missing or
+        empty, a message is written to stderr and nothing is queried.
+        """
 
-        info_dict = {}
+        api_key = getattr(settings, 'GITHUB_BACKUP_API_KEY', None)
+        if not api_key:
+            self.stderr.write("GITHUB_BACKUP_API_KEY is not set; cannot query GitHub.")
+            return
+        github = github3.login(token=api_key)
+        owners = dict()
+        repos = set()
 
-        for repo in DocExtResource.objects.filter(name__slug='github_repo'):
-            if not repo.value.endswith('/'):
-                repo.value += '/'
-            if repo not in info_dict:
-                info_dict[repo.value] = []
-            for username in DocExtResource.objects.filter(name__slug='github_username', doc=F('doc')):
-                info_dict[repo.value].push(username.value)
+        for cls in (DocExtResource, GroupExtResource, PersonExtResource):
+            for res in cls.objects.filter(name_id__in=('github_repo','github_org')):
+                path_parts = urlparse(res.value).path.strip('/').split('/')
+                if not path_parts or not path_parts[0]:
+                    continue
 
-        for repo in GroupExtResource.objects.filter(name__slug='github_repo'):
-            if not repo.value.endswith('/'):
-                repo.value += '/'
-            if repo not in info_dict:
-                info_dict[repo.value] = []
-            for username in GroupExtResource.objects.filter(name__slug='github_username', group=F('group')):
-                info_dict[repo.value].push(username.value)
+                owner = path_parts[0]
 
-        for repo in PersonExtResource.objects.filter(name__slug='github_repo'):
-            if not repo.value.endswith('/'):
-                repo.value += '/'
-            if repo not in info_dict:
-                info_dict[repo.value] = []
-            for username in PersonExtResource.objects.filter(name__slug='github_username', person=F('person')):
-                info_dict[repo.value].push(username.value)
+                if owner not in owners:
+                    try:
+                        owners[owner] = github.user(username=owner)
+                    except github3.exceptions.NotFoundError:
+                        continue
+                # Read the owner back from the cache every time: when the owner
+                # was seen on an earlier resource, a loop-carried variable would
+                # still hold whichever owner was fetched last.
+                gh_owner = owners[owner]
 
-        #print (json.dumps(info_dict))
-        # For now, all we need are the repo names
-        for name in info_dict.keys():
-            print(name)
+                if gh_owner.type in ('User', 'Organization'):
+                    if len(path_parts) > 1:
+                        repo = path_parts[1]
+                        if (owner, repo) not in repos:
+                            try:
+                                _ = github.repository(owner,repo)
+                                repos.add( (owner, repo) )
+                            except github3.exceptions.NotFoundError:
+                                continue
+                    else:
+                        for repo in github.repositories_by(owner):
+                            repos.add( (owner, repo.name) )
+
+        owner_types = Counter([owners[owner].type for owner in owners])
+        print ("Owners:")
+        for key in owner_types:
+            print(" ",key,':',owner_types[key])
+        print ("Repositories:", len(repos))
+        for repo in sorted(repos):
+            print(" https://github.com/%s/%s" % repo )
diff --git a/requirements.txt b/requirements.txt
index eff3d6e61..e07f360d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,6 +27,7 @@ django-widget-tweaks>=1.4.2
 docutils>=0.12,!=0.15
 factory-boy>=2.9.0,<3
 Faker>=0.8.8,!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode.
+github3.py>=1.2
 hashids>=1.1.0
 html2text>=2019.8.11
 html5lib>=1.0.1