From 4c972f00bdca8526fc2c21a68022a17e6cbdbfac Mon Sep 17 00:00:00 2001 From: Tobias Henkel Date: Tue, 25 Feb 2020 21:19:54 +0100 Subject: [PATCH] Optimize canMerge using graphql The canMerge check is executed whenever zuul tests if a change can enter a gate pipeline. This is part of the critical path in the event handling of the scheduler and therefore must be as fast as possible. Currently this takes five requests for doing its work and also transfers large amounts of data that is unneeded: * get pull request * get branch protection settings * get commits * get status of latest commit * get check runs of latest commit Especially when Github is busy this can slow down zuul's event processing considerably. This can be optimized using graphql to only query the data we need with a single request. This reduces requests and load on Github and speeds up event processing in the scheduler. Since this is the first usage of graphql this also sets up needed testing infrastructure using graphene to mock the github api with real test data. Change-Id: I77be4f16cf7eb5c8035ce0312f792f4e8d4c3e10 --- MANIFEST.in | 1 + requirements.txt | 1 + test-requirements.txt | 1 + tests/fake_graphql.py | 166 ++++++++++++++++++++ tests/fakegithub.py | 18 +++ zuul/driver/github/githubconnection.py | 92 +++++------ zuul/driver/github/graphql/__init__.py | 113 +++++++++++++ zuul/driver/github/graphql/canmerge.graphql | 40 +++++ 8 files changed, 376 insertions(+), 56 deletions(-) create mode 100644 tests/fake_graphql.py create mode 100644 zuul/driver/github/graphql/__init__.py create mode 100644 zuul/driver/github/graphql/canmerge.graphql diff --git a/MANIFEST.in b/MANIFEST.in index 0b6aed491c..e571dfa8b6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,7 @@ include AUTHORS include ChangeLog include zuul/ansible/*/* include zuul/ansible/*/*/* +include zuul/driver/github/graphql/* include zuul/lib/ansible-config.conf include zuul/web/static/* include zuul/web/static/static/*/* diff --git a/requirements.txt b/requirements.txt index 59f67f55fa..0de982df49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,6 +29,7 @@ paho-mqtt cherrypy ws4py routes +pathspec jsonpath-rw urllib3!=1.25.4,!=1.25.5 # https://github.com/urllib3/urllib3/pull/1684 # TODO(tobiash): cheroot 8.1.0 introduced a regression when handling concurrent diff --git a/test-requirements.txt b/test-requirements.txt index f2ff59c4cd..f7ee86ed41 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -5,3 +5,4 @@ testtools>=0.9.32 PyMySQL psycopg2-binary beautifulsoup4 +graphene diff --git a/tests/fake_graphql.py b/tests/fake_graphql.py new file mode 100644 index 0000000000..66bbdfad61 --- /dev/null +++ b/tests/fake_graphql.py @@ -0,0 +1,166 @@ +# Copyright 2019 BMW Group +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from graphene import Boolean, Field, Int, List, ObjectType, String + + +class FakePageInfo(ObjectType): + end_cursor = String() + has_next_page = Boolean() + + def resolve_end_cursor(parent, info): + return 'testcursor' + + def resolve_has_next_page(parent, info): + return False + + +class FakeBranchProtectionRule(ObjectType): + pattern = String() + requiredStatusCheckContexts = List(String) + requiresApprovingReviews = Boolean() + requiresCodeOwnerReviews = Boolean() + + def resolve_pattern(parent, info): + return parent.pattern + + def resolve_requiredStatusCheckContexts(parent, info): + return parent.required_contexts + + def resolve_requiresApprovingReviews(parent, info): + return parent.require_reviews + + def resolve_requiresCodeOwnerReviews(parent, info): + return parent.require_codeowners_review + + +class FakeBranchProtectionRules(ObjectType): + nodes = List(FakeBranchProtectionRule) + + def resolve_nodes(parent, info): + return parent.values() + + +class FakeStatusContext(ObjectType): + state = String() + context = String() + + def resolve_state(parent, info): + state = parent.state.upper() + return state + + def resolve_context(parent, info): + return parent.context + + +class FakeStatus(ObjectType): + contexts = List(FakeStatusContext) + + def resolve_contexts(parent, info): + return parent + + +class FakeCheckRun(ObjectType): + name = String() + conclusion = String() + + def resolve_name(parent, info): + return parent.name + + def resolve_conclusion(parent, info): + return parent.conclusion.upper() + + +class FakeCheckRuns(ObjectType): + nodes = List(FakeCheckRun) + + def resolve_nodes(parent, info): + return parent + + +class FakeCheckSuite(ObjectType): + checkRuns = Field(FakeCheckRuns, first=Int()) + + def resolve_checkRuns(parent, info, first=None): + return parent + + +class FakeCheckSuites(ObjectType): + + nodes = List(FakeCheckSuite) + + def resolve_nodes(parent, info): + # Note: we only use a single check suite in the tests so return a + # single item to keep it simple. + return [parent] + + +class FakeCommit(ObjectType): + + class Meta: + # Graphql object type that defaults to the class name, but we require + # 'Commit'. + name = 'Commit' + + status = Field(FakeStatus) + checkSuites = Field(FakeCheckSuites, first=Int()) + + def resolve_status(parent, info): + seen = set() + result = [] + for status in parent._statuses: + if status.context not in seen: + seen.add(status.context) + result.append(status) + # Github returns None if there are no results + return result or None + + def resolve_checkSuites(parent, info, first=None): + # Tests only utilize one check suite so return all runs for that. + return parent._check_runs + + +class FakePullRequest(ObjectType): + isDraft = Boolean() + + def resolve_isDraft(parent, info): + return parent.draft + + +class FakeRepository(ObjectType): + name = String() + branchProtectionRules = Field(FakeBranchProtectionRules, first=Int()) + pullRequest = Field(FakePullRequest, number=Int(required=True)) + object = Field(FakeCommit, expression=String(required=True)) + + def resolve_name(parent, info): + org, name = parent.name.split('/') + return name + + def resolve_branchProtectionRules(parent, info, first): + return parent._branch_protection_rules + + def resolve_pullRequest(parent, info, number): + return parent.data.pull_requests.get(number) + + def resolve_object(parent, info, expression): + return parent._commits.get(expression) + + +class FakeGithubQuery(ObjectType): + repository = Field(FakeRepository, owner=String(required=True), + name=String(required=True)) + + def resolve_repository(root, info, owner, name): + return info.context.repos.get((owner, name)) diff --git a/tests/fakegithub.py b/tests/fakegithub.py index b16a8e410b..366f503d2d 100644 --- a/tests/fakegithub.py +++ b/tests/fakegithub.py @@ -19,9 +19,12 @@ import github3.exceptions import re import time +import graphene from requests import HTTPError from requests.structures import CaseInsensitiveDict +from tests.fake_graphql import FakeGithubQuery + FAKE_BASE_URL = 'https://example.com/api/v3/' @@ -536,6 +539,8 @@ class FakeGithubSession(object): def __init__(self, data): self._data = data self.headers = CaseInsensitiveDict() + self._base_url = None + self.schema = graphene.Schema(query=FakeGithubQuery) def build_url(self, *args): fakepath = '/'.join(args) @@ -554,6 +559,17 @@ class FakeGithubSession(object): # unknown entity to process return None + def post(self, url, data=None, headers=None, params=None, json=None): + + if json and json.get('query'): + query = json.get('query') + variables = json.get('variables') + result = self.schema.execute( + query, variables=variables, context=self._data) + return FakeResponse({'data': result.data}, 200) + + return FakeResponse(None, 404) + def get_repo(self, request, params=None): org, project, request = request.split('/', 2) project_name = '{}/{}'.format(org, project) @@ -569,6 +585,8 @@ class FakeBranchProtectionRule: def __init__(self): self.pattern = None self.required_contexts = [] + self.require_reviews = False + self.require_codeowners_review = False class FakeGithubData(object): diff --git a/zuul/driver/github/githubconnection.py b/zuul/driver/github/githubconnection.py index 0bb1808d79..d16728bcd8 100644 --- a/zuul/driver/github/githubconnection.py +++ b/zuul/driver/github/githubconnection.py @@ -23,6 +23,7 @@ import threading import time import re import json +from collections import OrderedDict import cherrypy import cachecontrol @@ -34,9 +35,11 @@ import jwt import requests import github3 import github3.exceptions +import github3.pulls from github3.session import AppInstallationTokenAuth from zuul.connection import BaseConnection +from zuul.driver.github.graphql import GraphQLClient from zuul.lib.gearworker import ZuulGearWorker from zuul.web.handler import BaseWebController from zuul.lib.logutil import get_annotated_logger @@ -47,6 +50,7 @@ from zuul.driver.github.githubmodel import PullRequest, GithubTriggerEvent GITHUB_BASE_URL = 'https://api.github.com' PREVIEW_JSON_ACCEPT = 'application/vnd.github.machine-man-preview+json' PREVIEW_DRAFT_ACCEPT = 'application/vnd.github.shadow-cat-preview+json' +PREVIEW_CHECKS_ACCEPT = 'application/vnd.github.antiope-preview+json' def _sign_request(body, secret): @@ -78,10 +82,18 @@ class GithubRequestLogger: self.log = get_annotated_logger(log, zuul_event_id) def log_request(self, response, *args, **kwargs): - self.log.debug('%s %s result: %s, size: %s, duration: %s', - response.request.method, response.url, - response.status_code, len(response.content), - int(response.elapsed.microseconds / 1000)) + fields = OrderedDict() + fields['result'] = response.status_code + fields['size'] = len(response.content) + fields['duration'] = int(response.elapsed.microseconds / 1000) + if response.url.endswith('/graphql'): + body = json.loads(response.request.body) + for key, value in body.get('variables', {}).items(): + fields[key] = value + info = ', '.join(['%s: %s' % (key, value) + for key, value in fields.items()]) + self.log.debug('%s %s %s', + response.request.method, response.url, info) class GithubRateLimitHandler: @@ -774,8 +786,10 @@ class GithubConnection(BaseConnection): self._log_rate_limit = False if self.server == 'github.com': + self.api_base_url = GITHUB_BASE_URL self.base_url = GITHUB_BASE_URL else: + self.api_base_url = 'https://%s/api' % self.server self.base_url = 'https://%s/api/v3' % self.server # ssl verification must default to true @@ -821,6 +835,8 @@ class GithubConnection(BaseConnection): r"^Depends-On: https://%s/.+/.+/pull/[0-9]+$" % self.server, re.MULTILINE | re.IGNORECASE) + self.graphql_client = GraphQLClient('%s/graphql' % self.api_base_url) + def toDict(self): d = super().toDict() d.update({ @@ -1509,31 +1525,24 @@ class GithubConnection(BaseConnection): github = self.getGithubClient(change.project.name, zuul_event_id=event) - # Append accept header so we get the draft status + # Append accept headers so we get the draft status and checks api self._append_accept_header(github, PREVIEW_DRAFT_ACCEPT) + self._append_accept_header(github, PREVIEW_CHECKS_ACCEPT) - owner, proj = change.project.name.split('/') - pull = github.pull_request(owner, proj, change.number) + # For performance reasons fetch all needed data for canMerge upfront + # using a single graphql call. + canmerge_data = self.graphql_client.fetch_canmerge(github, change) # If the PR is a draft it cannot be merged. - # TODO: This uses the dict instead of the pull object since github3.py - # has no support for draft PRs yet. Replace this with pull.draft when - # support has been added. - # https://github.com/sigmavirus24/github3.py/issues/926 - if pull.as_dict().get('draft', False): + if canmerge_data.get('isDraft', False): log.debug('Change %s can not merge because it is a draft', change) return False - protection = self._getBranchProtection( - change.project.name, change.branch, zuul_event_id=event) - - if not self._hasRequiredStatusChecks(allow_needs, protection, pull): + if not self._hasRequiredStatusChecks(allow_needs, canmerge_data): return False - required_reviews = protection.get( - 'required_pull_request_reviews') - if required_reviews: - if required_reviews.get('require_code_owner_reviews'): + if canmerge_data.get('requiresApprovingReviews'): + if canmerge_data.get('requiresCodeOwnerReviews'): # we need to process the reviews using code owners # TODO(tobiash): not implemented yet pass @@ -1647,14 +1656,9 @@ class GithubConnection(BaseConnection): return resp.json() - def _hasRequiredStatusChecks(self, allow_needs, protection, pull): - if not protection: - # There are no protection settings -> ok by definition - return True - - required_contexts = protection.get( - 'required_status_checks', {}).get('contexts') - + @staticmethod + def _hasRequiredStatusChecks(allow_needs, canmerge_data): + required_contexts = canmerge_data['requiredStatusCheckContexts'] if not required_contexts: # There are no required contexts -> ok by definition return True @@ -1663,33 +1667,9 @@ class GithubConnection(BaseConnection): required_contexts = set( [x for x in required_contexts if x not in allow_needs]) - # NOTE(tobiash): We cannot just take the last commit in the list - # because it is not sorted that the head is the last one in every case. - # E.g. when doing a re-merge from the target the PR head can be - # somewhere in the middle of the commit list. Thus we need to search - # the whole commit list for the PR head commit which has the statuses - # attached. - commits = list(pull.commits()) - commit = None - for c in commits: - if c.sha == pull.head.sha: - commit = c - break - # Get successful statuses - successful = set([s.context for s in commit.status().statuses - if s.state == 'success']) - - if self.app_id: - try: - # Required contexts can be fulfilled by statuses or check runs. - successful.update([cr.name for cr in commit.check_runs() - if cr.conclusion == 'success']) - except github3.exceptions.GitHubException as exc: - self.log.error( - "Unable to retrieve check runs for commit %s: %s", - commit.sha, str(exc) - ) + successful = set([s[0] for s in canmerge_data['status'].items() + if s[1] == 'SUCCESS']) # Required contexts must be a subset of the successful contexts as # we allow additional successful status contexts we don't care about. @@ -1799,11 +1779,11 @@ class GithubConnection(BaseConnection): github = self.getGithubClient( project_name, zuul_event_id=zuul_event_id ) + self._append_accept_header(github, PREVIEW_CHECKS_ACCEPT) url = github.session.build_url( "repos", project_name, "commits", sha, "check-runs") - headers = {'Accept': 'application/vnd.github.antiope-preview+json'} params = {"per_page": 100} - resp = github.session.get(url, params=params, headers=headers) + resp = github.session.get(url, params=params) resp.raise_for_status() log.debug("Got commit checks for sha %s on %s", sha, project_name) diff --git a/zuul/driver/github/graphql/__init__.py b/zuul/driver/github/graphql/__init__.py new file mode 100644 index 0000000000..8d8f1f58c4 --- /dev/null +++ b/zuul/driver/github/graphql/__init__.py @@ -0,0 +1,113 @@ +# Copyright 2020 BMW Group +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import logging + +import pathspec +from pkg_resources import resource_string + + +def nested_get(d, *keys, default=None): + temp = d + for key in keys[:-1]: + temp = temp.get(key, {}) if temp is not None else None + return temp.get(keys[-1], default) if temp is not None else default + + +class GraphQLClient: + log = logging.getLogger('zuul.github.graphql') + + def __init__(self, url): + self.url = url + self.queries = {} + self._load_queries() + + def _load_queries(self): + self.log.debug('Loading prepared graphql queries') + query_names = [ + 'canmerge', + ] + for query_name in query_names: + self.queries[query_name] = resource_string( + __name__, '%s.graphql' % query_name).decode('utf-8') + + @staticmethod + def _prepare_query(query, variables): + data = { + 'query': query, + 'variables': variables, + } + return data + + def _fetch_canmerge(self, github, owner, repo, pull, sha): + variables = { + 'zuul_query': 'canmerge', # used for logging + 'owner': owner, + 'repo': repo, + 'pull': pull, + 'head_sha': sha, + } + query = self._prepare_query(self.queries['canmerge'], variables) + response = github.session.post(self.url, json=query) + return response.json() + + def fetch_canmerge(self, github, change): + owner, repo = change.project.name.split('/') + + data = self._fetch_canmerge(github, owner, repo, change.number, + change.patchset) + result = {} + + repository = nested_get(data, 'data', 'repository') + # Find corresponding rule to our branch + rules = nested_get(repository, 'branchProtectionRules', 'nodes', + default=[]) + matching_rule = None + for rule in rules: + pattern = pathspec.patterns.GitWildMatchPattern( + rule.get('pattern')) + match = pathspec.match_files([pattern], [change.branch]) + if match: + matching_rule = rule + break + + # If there is a matching rule, get required status checks + if matching_rule: + result['requiredStatusCheckContexts'] = matching_rule.get( + 'requiredStatusCheckContexts', []) + result['requiresApprovingReviews'] = matching_rule.get( + 'requiresApprovingReviews') + result['requiresCodeOwnerReviews'] = matching_rule.get( + 'requiresCodeOwnerReviews') + else: + result['requiredStatusCheckContexts'] = [] + + # Check for draft + pull_request = nested_get(repository, 'pullRequest') + result['isDraft'] = nested_get(pull_request, 'isDraft', default=False) + + # Add status checks + result['status'] = {} + commit = nested_get(data, 'data', 'repository', 'object') + # Status can be explicit None so make sure we work with a dict + # afterwards + status = commit.get('status') or {} + for context in status.get('contexts', []): + result['status'][context['context']] = context['state'] + + # Add check runs + for suite in nested_get(commit, 'checkSuites', 'nodes', default=[]): + for run in nested_get(suite, 'checkRuns', 'nodes', default=[]): + result['status'][run['name']] = run['conclusion'] + + return result diff --git a/zuul/driver/github/graphql/canmerge.graphql b/zuul/driver/github/graphql/canmerge.graphql new file mode 100644 index 0000000000..4bddd73dcc --- /dev/null +++ b/zuul/driver/github/graphql/canmerge.graphql @@ -0,0 +1,40 @@ +query canMergeData( + $owner: String! + $repo: String! + $pull: Int! + $head_sha: String! +) { + repository(owner: $owner, name: $repo) { + branchProtectionRules(first: 100) { + nodes { + pattern + requiredStatusCheckContexts + requiresApprovingReviews + requiresCodeOwnerReviews + } + } + pullRequest(number: $pull) { + isDraft + } + object(expression: $head_sha) { + ... on Commit { + checkSuites(first: 100) { + nodes { + checkRuns(first: 100) { + nodes { + name + conclusion + } + } + } + } + status { + contexts { + state + context + } + } + } + } + } +}