diff --git a/.gitignore b/.gitignore index 82afe238f51..1232e0e20fb 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ venv/ # Database treeherder_backup_* +.*backup.dump # Caches .eslintcache diff --git a/docker-compose.yml b/docker-compose.yml index bb0402e4d4a..1446f8613df 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -66,6 +66,8 @@ services: working_dir: /app environment: BACKEND: http://backend:8000 + COREPACK_INTEGRITY_KEYS: '0' + CI: 'true' command: sh -c "corepack enable && corepack prepare pnpm@9.15.0 --activate && pnpm install && pnpm start --host 0.0.0.0" volumes: - .:/app diff --git a/treeherder/etl/git_pushlog.py b/treeherder/etl/git_pushlog.py new file mode 100644 index 00000000000..a2679a97e12 --- /dev/null +++ b/treeherder/etl/git_pushlog.py @@ -0,0 +1,152 @@ +import logging + +import newrelic.agent +from django.core.cache import cache + +from treeherder.etl.common import to_timestamp +from treeherder.etl.push import store_push +from treeherder.etl.revision_mapper import parse_github_url +from treeherder.model.models import Repository +from treeherder.utils.github import get_all_commits, get_commit + +logger = logging.getLogger(__name__) +ONE_WEEK_IN_SECONDS = 604800 + + +class GitPushFetchError(Exception): + """Raised when fetching or parsing a git push fails.""" + + +def _transform_github_commit(commit_data): + """Transform a GitHub commit API response into the standard push dict format. + + Returns: + { + "revision": "<40-char sha>", + "author": "email@example.com", + "push_timestamp": , + "revisions": [{"revision": "...", "author": "...", "comment": "..."}, ...] 
+ } + """ + commit_info = commit_data["commit"] + author_name = commit_info["author"]["name"] + author_email = commit_info["author"]["email"] + return { + "revision": commit_data["sha"], + "author": author_email, + "push_timestamp": to_timestamp(commit_info["committer"]["date"]), + "revisions": [ + { + "revision": commit_data["sha"], + "author": f"{author_name} <{author_email}>", + "comment": commit_info["message"], + } + ], + } + + +def fetch_git_push(git_url, revision): + """Fetch a single commit's data from GitHub and return as a standard push dict. + + Args: + git_url: GitHub repository URL (e.g., "https://github.com/owner/repo") + revision: The git commit SHA to fetch + + Returns: + Standard push dict suitable for store_push() + + Raises: + GitPushFetchError: If the fetch fails + """ + owner, repo = parse_github_url(git_url) + try: + data = get_commit(owner, repo, revision) + except Exception as e: + raise GitPushFetchError( + f"Failed to fetch git commit {revision} from {owner}/{repo}: {e}" + ) from e + + return _transform_github_commit(data) + + +class GitPushlogProcess: + """Mirrors HgPushlogProcess but fetches from the GitHub commits API. + + Used for scheduled polling of git-based repositories. + """ + + def extract(self, owner, repo, branch, since=None): + """Fetch commits from GitHub commits API.""" + params = {"sha": branch, "per_page": 100} + if since: + params["since"] = since + try: + return get_all_commits(owner, repo, params) + except Exception as e: + logger.warning( + "Failed to fetch git commits from %s/%s branch %s: %s", + owner, + repo, + branch, + e, + ) + raise + + def transform_commit(self, commit_data): + """Transform a GitHub commit into the standard push dict format.""" + return _transform_github_commit(commit_data) + + def run(self, git_url, branch, repository_name, since_timestamp=None): + """Fetch and store new pushes from Git. 
+ + Similar caching pattern to HgPushlogProcess: caches the timestamp + of the last processed commit to enable incremental fetching. + """ + cache_key = f"{repository_name}:last_git_commit_time" + if not since_timestamp: + since_timestamp = cache.get(cache_key) + + owner, repo = parse_github_url(git_url) + newrelic.agent.add_custom_attribute("repo_name", repository_name) + newrelic.agent.add_custom_attribute("git_url", git_url) + + commits = self.extract(owner, repo, branch, since=since_timestamp) + + if not commits: + return None + + repository = Repository.objects.get(name=repository_name) + errors = [] + latest_timestamp = since_timestamp + + # GitHub returns commits newest-first; process in chronological order + for commit_data in reversed(commits): + try: + push_data = self.transform_commit(commit_data) + store_push(repository, push_data) + + commit_time = commit_data["commit"]["committer"]["date"] + if not latest_timestamp or commit_time > latest_timestamp: + latest_timestamp = commit_time + except Exception: + logger.exception( + "Error storing git push for %s: %s", + repository_name, + commit_data.get("sha", "unknown"), + ) + newrelic.agent.notice_error() + errors.append(commit_data.get("sha", "unknown")) + + if latest_timestamp: + cache.set(cache_key, latest_timestamp, ONE_WEEK_IN_SECONDS) + + top_revision = commits[0]["sha"] if commits else None + + if errors: + logger.warning( + "Errors processing %d git commits for %s", + len(errors), + repository_name, + ) + + return top_revision diff --git a/treeherder/etl/job_loader.py b/treeherder/etl/job_loader.py index 1e346041740..0f28d242e65 100644 --- a/treeherder/etl/job_loader.py +++ b/treeherder/etl/job_loader.py @@ -11,7 +11,7 @@ from treeherder.etl.jobs import store_job_data from treeherder.etl.schema import get_json_schema from treeherder.etl.taskcluster_pulse.handler import ignore_task -from treeherder.model.models import Push, Repository +from treeherder.model.models import Push, Repository, 
RevisionMapping from treeherder.utils.taskcluster import get_task_definition logger = logging.getLogger(__name__) @@ -109,6 +109,22 @@ def validate_revision(self, repository, pulse_job): ) if not Push.objects.filter(**filter_kwargs).exists(): + # For repos transitioning from hg to git, the push may be stored + # under a git revision while the task references an hg revision. + # Check the revision mapping to find the corresponding git push. + mapping = RevisionMapping.objects.filter( + repository=repository, hg_revision=revision + ).first() + if ( + mapping + and Push.objects.filter( + repository=repository, revision=mapping.git_revision + ).exists() + ): + # Rewrite the origin revision so downstream code uses the git SHA + pulse_job["origin"]["revision"] = mapping.git_revision + return + (real_task_id, _) = task_and_retry_ids(pulse_job["taskId"]) project = pulse_job["origin"]["project"] repository = Repository.objects.get(name=project) diff --git a/treeherder/etl/management/commands/backfill_git_pushes.py b/treeherder/etl/management/commands/backfill_git_pushes.py new file mode 100644 index 00000000000..ac0215a1c69 --- /dev/null +++ b/treeherder/etl/management/commands/backfill_git_pushes.py @@ -0,0 +1,251 @@ +import logging +import time + +from django.core.management.base import BaseCommand +from django.db import transaction + +from treeherder.etl.revision_mapper import RevisionMapper +from treeherder.model.models import Commit, Job, Push, Repository, RevisionMapping +from treeherder.utils.queryset import chunked_qs + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = ( + "Re-map push/commit revisions from Mercurial SHAs to Git SHAs " + "for repositories migrating from hg to git. Requires the repository " + "to have a git_url configured." 
+ ) + + def add_arguments(self, parser): + parser.add_argument( + "--project", + help="Specific repository name to process (default: all with git_url set)", + ) + parser.add_argument( + "--chunk-size", + type=int, + default=500, + help="Number of pushes to process per chunk (default: 500)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Report what would be done without making changes", + ) + parser.add_argument( + "--mapfile", + help="Path to a cinnabar mapfile for bulk SHA mapping (format: per line)", + ) + parser.add_argument( + "--local-git-repo", + help="Path to a local git clone for extracting hg revision metadata from commit messages", + ) + parser.add_argument( + "--rate-limit", + type=int, + default=4000, + help="Max GitHub API requests per hour when using API-based mapping (default: 4000)", + ) + parser.add_argument( + "--resume-from-id", + type=int, + default=0, + help="Resume processing from this Push ID (useful for interrupted runs)", + ) + parser.add_argument( + "--flip-dvcs-type", + action="store_true", + help="After successful re-mapping of all pushes, update the repository " + "to dvcs_type='git' and url=git_url", + ) + + def handle(self, *args, **options): + repos = Repository.objects.filter( + dvcs_type="hg", + git_url__isnull=False, + active_status="active", + ) + if options["project"]: + repos = repos.filter(name=options["project"]) + + if not repos.exists(): + self.stderr.write("No repositories found matching criteria.") + return + + for repo in repos: + self.stdout.write(f"\n{'=' * 60}") + self.stdout.write(f"Processing repository: {repo.name}") + self.stdout.write(f" hg URL: {repo.url}") + self.stdout.write(f" git URL: {repo.git_url}") + self.stdout.write(f"{'=' * 60}") + + mapper = RevisionMapper(repo) + + # Step 1: Populate revision mappings if a mapfile or local repo is provided + self._populate_mappings(mapper, options) + + # Step 2: Re-map push revisions + stats = self._remap_pushes(repo, mapper, options) + + # 
Step 3: Report results + self._report_results(repo, stats, options) + + def _populate_mappings(self, mapper, options): + """Populate the RevisionMapping table from available sources.""" + if options["mapfile"]: + self.stdout.write(f"Loading mappings from mapfile: {options['mapfile']}") + if not options["dry_run"]: + count = mapper.populate_from_mapfile(options["mapfile"]) + self.stdout.write(f" Loaded {count} mappings from mapfile") + else: + self.stdout.write(" [dry-run] Would load mappings from mapfile") + + if options["local_git_repo"]: + self.stdout.write(f"Loading mappings from local git repo: {options['local_git_repo']}") + if not options["dry_run"]: + count = mapper.populate_from_git_log(options["local_git_repo"]) + self.stdout.write(f" Loaded {count} mappings from git log") + else: + self.stdout.write(" [dry-run] Would load mappings from git log") + + existing_count = RevisionMapping.objects.filter(repository=mapper.repository).count() + self.stdout.write(f" Total mappings in database: {existing_count}") + + def _remap_pushes(self, repo, mapper, options): + """Re-map Push and Commit revisions from hg to git SHAs.""" + stats = {"total": 0, "remapped": 0, "merged": 0, "failed": 0, "skipped": 0} + + queryset = Push.objects.filter(repository=repo) + if options["resume_from_id"]: + queryset = queryset.filter(id__gte=options["resume_from_id"]) + + # Track API calls for rate limiting + api_calls = 0 + rate_limit = options["rate_limit"] + rate_limit_interval = 3600.0 / rate_limit if rate_limit > 0 else 0 + + for chunk in chunked_qs( + queryset, chunk_size=options["chunk_size"], fields=["id", "revision"] + ): + if not chunk: + break + + for push in chunk: + stats["total"] += 1 + + # Look up git SHA for this push's hg revision + git_revision = mapper.map_hg_to_git(push.revision) + + if not git_revision: + # If no mapfile/local-repo mapping exists, try API + # (this counts against rate limit) + if rate_limit_interval > 0: + time.sleep(rate_limit_interval) + api_calls 
+= 1 + + git_revision = mapper.map_hg_to_git(push.revision) + if not git_revision: + stats["failed"] += 1 + if stats["failed"] <= 20: + logger.warning( + "No git mapping for push %d (rev %s) in %s", + push.id, + push.revision, + repo.name, + ) + continue + + if git_revision == push.revision: + # Already a git revision (or identical SHA) + stats["skipped"] += 1 + continue + + # Perform the re-mapping + result = self._remap_push(push, git_revision, mapper, options["dry_run"]) + stats[result] += 1 + + self.stdout.write( + f" Progress: {stats['total']} processed, " + f"{stats['remapped']} remapped, {stats['merged']} merged, " + f"{stats['failed']} failed, {stats['skipped']} skipped " + f"(last push ID: {chunk[-1].id})" + ) + + return stats + + def _remap_push(self, push, git_revision, mapper, dry_run): + """Re-map a single push's revision from hg to git. + + Returns 'remapped' or 'merged' depending on the action taken. + """ + # Check if a push with the git revision already exists + existing = Push.objects.filter(repository=push.repository, revision=git_revision).first() + + if existing and existing.id != push.id: + # A push with this git SHA already exists (ingested via git). + # Merge: move jobs/commits from the hg push to the git push. + if not dry_run: + with transaction.atomic(): + Job.objects.filter(push=push).update(push=existing) + + for commit in push.commits.all(): + git_commit_rev = mapper.map_hg_to_git(commit.revision) + if ( + git_commit_rev + and not Commit.objects.filter( + push=existing, revision=git_commit_rev + ).exists() + ): + commit.revision = git_commit_rev + commit.push = existing + commit.save() + # If the git commit already exists on the target push, + # the old commit will be deleted with the push below. 
+ + push.delete() + return "merged" + else: + # Update revision in place + if not dry_run: + with transaction.atomic(): + Push.objects.filter(id=push.id).update(revision=git_revision) + + for commit in push.commits.all(): + git_commit_rev = mapper.map_hg_to_git(commit.revision) + if git_commit_rev: + Commit.objects.filter(id=commit.id).update(revision=git_commit_rev) + return "remapped" + + def _report_results(self, repo, stats, options): + """Report final results and optionally flip the repo to git.""" + self.stdout.write(f"\nResults for {repo.name}:") + self.stdout.write(f" Total pushes processed: {stats['total']}") + self.stdout.write(f" Successfully remapped: {stats['remapped']}") + self.stdout.write(f" Merged with existing: {stats['merged']}") + self.stdout.write(f" Already correct: {stats['skipped']}") + self.stdout.write(f" Failed (no mapping): {stats['failed']}") + + if options["dry_run"]: + self.stdout.write(" [dry-run] No changes were made.") + return + + if stats["failed"] > 0: + self.stderr.write( + f" WARNING: {stats['failed']} pushes could not be mapped. " + "Re-run with --mapfile or --local-git-repo to provide mappings." 
+ ) + if options["flip_dvcs_type"]: + self.stderr.write(" Skipping dvcs_type flip due to unmapped pushes.") + return + + if options["flip_dvcs_type"]: + repo.dvcs_type = "git" + repo.url = repo.git_url + repo.branch = repo.git_branch or "main" + repo.save() + self.stdout.write( + f" Flipped {repo.name} to dvcs_type='git', " + f"url='{repo.git_url}', branch='{repo.branch}'" + ) diff --git a/treeherder/etl/management/commands/ingest.py b/treeherder/etl/management/commands/ingest.py index a4d8a1c4f17..6d6d00eeb1f 100644 --- a/treeherder/etl/management/commands/ingest.py +++ b/treeherder/etl/management/commands/ingest.py @@ -350,9 +350,52 @@ def ingest_push(project, revision, fetch_push_id=None): pulse = github_push_to_pulse(_repo, revision) PushLoader().process(pulse["payload"], pulse["exchange"], _repo["tc_root_url"]) else: + # For hg repos with a git_url configured, try git-first ingestion + repo_obj = Repository.objects.get(name=project) + if repo_obj.git_url: + try: + _ingest_git_first(repo_obj, revision) + return + except Exception: + logger.warning( + "Git-first ingest failed for %s, falling back to hg", + project, + exc_info=True, + ) _ingest_hg_push(project, revision) +def _ingest_git_first(repo_obj, revision): + """Attempt to ingest a push from the Git source for a transitioning repo. + + The revision can be either a git SHA or an hg revision. We first check + if it exists directly in the git repo; if not, we try hg-to-git mapping. 
+ """ + from treeherder.etl.git_pushlog import fetch_git_push + from treeherder.etl.push import store_push_data + from treeherder.etl.revision_mapper import RevisionMapper + + mapper = RevisionMapper(repo_obj) + + # First, check if the revision is already a valid git SHA + if mapper.verify_revision_in_git(revision): + git_revision = revision + else: + # Try hg-to-git mapping + git_revision = mapper.map_hg_to_git(revision) + if not git_revision: + raise ValueError(f"No git mapping found for hg revision {revision} in {repo_obj.name}") + + push_data = fetch_git_push(repo_obj.git_url, git_revision) + store_push_data(repo_obj, [push_data]) + logger.info( + "Ingested push via Git for %s: revision %s (git %s)", + repo_obj.name, + revision, + git_revision, + ) + + def ingest_git_pushes(project, dry_run=False): """ This method takes all commits for a repo from Github and determines which ones are considered @@ -417,7 +460,9 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( - "ingestion_type", nargs=1, help="Type of ingestion to do: [task|hg-push|git-commit|pr]" + "ingestion_type", + nargs=1, + help="Type of ingestion to do: [task|hg-push|git-commit|pr]", ) parser.add_argument("-p", "--project", help="Hg repository to query (e.g. 
autoland)") parser.add_argument("-c", "--commit", "-r", "--revision", help="Commit/revision to import") @@ -451,7 +496,9 @@ def add_arguments(self, parser): help="Do not make changes to the database", ) parser.add_argument( - "--last-n-pushes", type=int, help="fetch the last N pushes from the repository" + "--last-n-pushes", + type=int, + help="fetch the last N pushes from the repository", ) def handle(self, *args, **options): diff --git a/treeherder/etl/push.py b/treeherder/etl/push.py index d2c3cd67a33..9d2bdbdf69d 100644 --- a/treeherder/etl/push.py +++ b/treeherder/etl/push.py @@ -25,7 +25,10 @@ def store_push(repository, push_dict): Commit.objects.update_or_create( push=push, revision=revision["revision"], - defaults={"author": revision["author"], "comments": revision["comment"]}, + defaults={ + "author": revision["author"], + "comments": revision["comment"], + }, ) diff --git a/treeherder/etl/push_loader.py b/treeherder/etl/push_loader.py index f487af595f0..26427630312 100644 --- a/treeherder/etl/push_loader.py +++ b/treeherder/etl/push_loader.py @@ -39,17 +39,61 @@ def process(self, message_body, exchange, root_url): ) newrelic.agent.record_custom_event("skip_unknown_repository", repo_info) logger.warning( - "Skipping unsupported repo: %s %s", transformer.repo_url, transformer.branch + "Skipping unsupported repo: %s %s", + transformer.repo_url, + transformer.branch, ) return transformed_data = transformer.transform(repo.name) + # Git-first: for transitioning hg repos with a git_url configured, + # attempt to fetch/map the push data from Git before storing. 
+ if repo.git_url and repo.dvcs_type == "hg": + transformed_data = self._try_git_first(repo, transformed_data) + logger.info( f"Storing push for repository '{repo.name}' revision '{transformed_data['revision']}' branch '{transformer.branch}' url {transformer.repo_url}", ) store_push_data(repo, [transformed_data]) + def _try_git_first(self, repo, hg_push_data): + """Try to fetch equivalent push data from Git, mapping hg revisions to git. + + If the mapping succeeds, returns git-sourced push data. + If it fails for any reason, returns the original hg push data as fallback. + """ + try: + from treeherder.etl.git_pushlog import fetch_git_push + from treeherder.etl.revision_mapper import RevisionMapper + + mapper = RevisionMapper(repo) + git_revision = mapper.map_hg_to_git(hg_push_data["revision"]) + + if git_revision: + git_push_data = fetch_git_push(repo.git_url, git_revision) + logger.info( + "Used Git source for push %s->%s in %s", + hg_push_data["revision"], + git_revision, + repo.name, + ) + return git_push_data + else: + logger.info( + "No git mapping found for %s in %s, using hg data", + hg_push_data["revision"], + repo.name, + ) + except Exception: + logger.warning( + "Git fetch failed for %s in %s, using Hg fallback", + hg_push_data["revision"], + repo.name, + exc_info=True, + ) + return hg_push_data + def get_transformer_class(self, exchange): if "github" in exchange: if exchange.endswith("push"): @@ -100,7 +144,8 @@ def process_push(self, push_data): { "comment": commit["commit"]["message"], "author": "{} <{}>".format( - commit["commit"]["author"]["name"], commit["commit"]["author"]["email"] + commit["commit"]["author"]["name"], + commit["commit"]["author"]["email"], ), "revision": commit["sha"], } diff --git a/treeherder/etl/revision_mapper.py b/treeherder/etl/revision_mapper.py new file mode 100644 index 00000000000..2dd4ef367f2 --- /dev/null +++ b/treeherder/etl/revision_mapper.py @@ -0,0 +1,196 @@ +import logging +import re +import subprocess + +from 
treeherder.model.models import RevisionMapping
+from treeherder.utils.github import get_commit
+
+logger = logging.getLogger(__name__)
+
+# Pattern to extract hg revision from git commit messages.
+# Mozilla's hg-to-git migration tools embed the hg changeset ID in commit metadata,
+# e.g., "Source-Revision: abc123..." or "hg-hierarchical-rev: abc123..."
+SOURCE_REVISION_PATTERNS = [
+    re.compile(r"Source-Revision:\s*([0-9a-f]{40})", re.IGNORECASE),
+    re.compile(r"hg-hierarchical-rev:\s*([0-9a-f]{40})", re.IGNORECASE),
+    re.compile(r"Differential Revision:.*\n.*\n?.*([0-9a-f]{40})", re.IGNORECASE),
+]
+
+
+def parse_github_url(git_url):
+    """Extract (owner, repo) from a GitHub URL.
+
+    Supports:
+    - https://github.com/owner/repo
+    - https://github.com/owner/repo.git (only a trailing ".git" is stripped)
+    """
+    url = git_url.rstrip("/").removesuffix(".git")
+    parts = url.split("/")
+    return parts[-2], parts[-1]
+
+
+class RevisionMapper:
+    """Maps Mercurial changeset node IDs to Git commit SHAs and vice versa."""
+
+    def __init__(self, repository):
+        self.repository = repository
+        self.owner, self.repo = parse_github_url(repository.git_url)
+
+    def map_hg_to_git(self, hg_revision):
+        """Look up or discover the git SHA for an hg revision.
+
+        Returns the git SHA string, or None if no mapping can be found.
+        """
+        # 1. Check the RevisionMapping table
+        mapping = RevisionMapping.objects.filter(
+            repository=self.repository, hg_revision=hg_revision
+        ).first()
+        if mapping:
+            return mapping.git_revision
+
+        # 2.
Try to find via commit message metadata in git + git_sha = self._find_by_commit_metadata(hg_revision) + if git_sha: + RevisionMapping.objects.update_or_create( + repository=self.repository, + hg_revision=hg_revision, + defaults={"git_revision": git_sha}, + ) + return git_sha + + return None + + def map_git_to_hg(self, git_revision): + """Reverse lookup: git SHA -> hg node ID.""" + mapping = RevisionMapping.objects.filter( + repository=self.repository, git_revision=git_revision + ).first() + return mapping.hg_revision if mapping else None + + def _find_by_commit_metadata(self, hg_revision): + """Search GitHub for a commit whose message contains the hg revision. + + Uses the GitHub search API to find commits that reference the hg SHA + in their commit message (e.g., via Source-Revision trailers). + """ + try: + # GitHub commit search: search commit messages for the hg SHA + from treeherder.utils.github import fetch_api + + results = fetch_api( + f"search/commits?q=repo:{self.owner}/{self.repo}+{hg_revision}", + ) + if results.get("total_count", 0) > 0: + for item in results["items"]: + message = item["commit"]["message"] + if hg_revision in message: + return item["sha"] + except Exception: + logger.debug( + "GitHub commit search failed for hg rev %s in %s/%s", + hg_revision, + self.owner, + self.repo, + ) + return None + + def populate_from_mapfile(self, mapfile_path, chunk_size=1000): + """Bulk-load mappings from a cinnabar mapfile. 
+ + Expected format: one mapping per line, " " + """ + batch = [] + loaded = 0 + with open(mapfile_path) as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + parts = line.split() + if len(parts) < 2: + continue + git_sha, hg_node = parts[0], parts[1] + batch.append( + RevisionMapping( + repository=self.repository, + hg_revision=hg_node, + git_revision=git_sha, + ) + ) + if len(batch) >= chunk_size: + RevisionMapping.objects.bulk_create(batch, ignore_conflicts=True) + loaded += len(batch) + logger.info("Loaded %d revision mappings so far", loaded) + batch = [] + if batch: + RevisionMapping.objects.bulk_create(batch, ignore_conflicts=True) + loaded += len(batch) + logger.info("Finished loading %d revision mappings from mapfile", loaded) + return loaded + + def populate_from_git_log(self, local_git_repo_path): + """Populate mappings by scanning git log for hg revision metadata. + + Requires a local clone of the git repo. Parses commit messages for + Source-Revision or similar trailers that embed the hg changeset ID. 
+ """ + result = subprocess.run( + ["git", "-C", local_git_repo_path, "log", "--format=%H %B%x00"], + capture_output=True, + text=True, + check=True, + ) + batch = [] + loaded = 0 + for entry in result.stdout.split("\0"): + entry = entry.strip() + if not entry: + continue + # First 40 chars are the git SHA, rest is commit message + git_sha = entry[:40] + message = entry[41:] + hg_sha = self._extract_hg_revision_from_message(message) + if hg_sha: + batch.append( + RevisionMapping( + repository=self.repository, + hg_revision=hg_sha, + git_revision=git_sha, + ) + ) + if len(batch) >= 1000: + RevisionMapping.objects.bulk_create(batch, ignore_conflicts=True) + loaded += len(batch) + logger.info("Loaded %d revision mappings so far", loaded) + batch = [] + if batch: + RevisionMapping.objects.bulk_create(batch, ignore_conflicts=True) + loaded += len(batch) + logger.info("Finished loading %d revision mappings from git log", loaded) + return loaded + + @staticmethod + def _extract_hg_revision_from_message(message): + """Extract an hg revision SHA from a git commit message.""" + for pattern in SOURCE_REVISION_PATTERNS: + match = pattern.search(message) + if match: + return match.group(1) + return None + + def verify_revision_in_git(self, revision): + """Check if a revision SHA exists in the Git repo via GitHub API.""" + try: + get_commit(self.owner, self.repo, revision) + return True + except Exception: + return False + + def verify_revision_in_local_git(self, local_git_repo_path, revision): + """Check if a revision exists in a local git clone.""" + result = subprocess.run( + ["git", "-C", local_git_repo_path, "cat-file", "-t", revision], + capture_output=True, + text=True, + ) + return result.returncode == 0 and result.stdout.strip() == "commit" diff --git a/treeherder/etl/tasks/pushlog_tasks.py b/treeherder/etl/tasks/pushlog_tasks.py index 20ff3f8cf71..6b47c53fadb 100644 --- a/treeherder/etl/tasks/pushlog_tasks.py +++ b/treeherder/etl/tasks/pushlog_tasks.py @@ -1,17 +1,30 
@@ +import logging + import newrelic.agent from celery import shared_task from treeherder.etl.pushlog import HgPushlogProcess from treeherder.model.models import Repository +logger = logging.getLogger(__name__) + @shared_task(name="fetch-push-logs") def fetch_push_logs(): """ - Run several fetch_hg_push_log subtasks, one per repository + Run several fetch_hg_push_log or fetch_git_push_log subtasks, one per repository. + + For hg repos that have a git_url configured, dispatches to the git-first + polling task which tries Git and falls back to Hg. """ for repo in Repository.objects.filter(dvcs_type="hg", active_status="active"): - fetch_hg_push_log.apply_async(args=(repo.name, repo.url), queue="pushlog") + if repo.git_url: + fetch_git_push_log.apply_async( + args=(repo.name, repo.git_url, repo.git_branch or "main", repo.url), + queue="pushlog", + ) + else: + fetch_hg_push_log.apply_async(args=(repo.name, repo.url), queue="pushlog") @shared_task(name="fetch-hg-push-logs", soft_time_limit=10 * 60) @@ -22,3 +35,27 @@ def fetch_hg_push_log(repo_name, repo_url): newrelic.agent.add_custom_attribute("repo_name", repo_name) process = HgPushlogProcess() process.run(repo_url + "/json-pushes/?full=1&version=2", repo_name) + + +@shared_task(name="fetch-git-push-logs", soft_time_limit=10 * 60) +def fetch_git_push_log(repo_name, git_url, git_branch, hg_url_fallback): + """ + Fetch push data from Git (GitHub), falling back to Hg on failure. + + This task is dispatched for hg repos that have a git_url configured, + enabling a gradual migration from Hg to Git-based push ingestion. 
+ """ + newrelic.agent.add_custom_attribute("repo_name", repo_name) + try: + from treeherder.etl.git_pushlog import GitPushlogProcess + + process = GitPushlogProcess() + process.run(git_url, git_branch, repo_name) + except Exception: + logger.warning( + "Git push fetch failed for %s, falling back to Hg", + repo_name, + exc_info=True, + ) + process = HgPushlogProcess() + process.run(hg_url_fallback + "/json-pushes/?full=1&version=2", repo_name) diff --git a/treeherder/model/migrations/0049_repository_git_fields_revision_mapping.py b/treeherder/model/migrations/0049_repository_git_fields_revision_mapping.py new file mode 100644 index 00000000000..b48e3c0c71a --- /dev/null +++ b/treeherder/model/migrations/0049_repository_git_fields_revision_mapping.py @@ -0,0 +1,57 @@ +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("model", "0048_alter_failureline_action"), + ] + + operations = [ + migrations.AddField( + model_name="repository", + name="git_url", + field=models.CharField(blank=True, max_length=255, null=True), + ), + migrations.AddField( + model_name="repository", + name="git_branch", + field=models.CharField( + blank=True, default="main", max_length=255, null=True + ), + ), + migrations.CreateModel( + name="RevisionMapping", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("hg_revision", models.CharField(db_index=True, max_length=40)), + ("git_revision", models.CharField(db_index=True, max_length=40)), + ( + "repository", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="model.repository", + ), + ), + ], + options={ + "db_table": "revision_mapping", + "unique_together": {("repository", "hg_revision")}, + "indexes": [ + models.Index( + fields=["repository", "git_revision"], + name="revmap_repo_git_idx", + ), + ], + }, + ), + ] diff --git 
a/treeherder/model/models.py b/treeherder/model/models.py index 3b2d04fc134..f4493da2fd3 100644 --- a/treeherder/model/models.py +++ b/treeherder/model/models.py @@ -109,6 +109,8 @@ class Repository(models.Model): expire_performance_data = models.BooleanField(default=True) is_try_repo = models.BooleanField(default=False) tc_root_url = models.CharField(max_length=255, null=False, db_index=True) + git_url = models.CharField(max_length=255, null=True, blank=True) + git_branch = models.CharField(max_length=255, null=True, blank=True, default="main") class Meta: db_table = "repository" @@ -202,6 +204,24 @@ def __str__(self): return f"{self.push.repository.name} {self.revision}" +class RevisionMapping(models.Model): + """Maps Mercurial changeset node IDs to Git commit SHAs for migrating repos.""" + + repository = models.ForeignKey(Repository, on_delete=models.CASCADE) + hg_revision = models.CharField(max_length=40, db_index=True) + git_revision = models.CharField(max_length=40, db_index=True) + + class Meta: + db_table = "revision_mapping" + unique_together = ("repository", "hg_revision") + indexes = [ + models.Index(fields=["repository", "git_revision"], name="revmap_repo_git_idx"), + ] + + def __str__(self): + return f"{self.repository.name} hg:{self.hg_revision[:12]} -> git:{self.git_revision[:12]}" + + class MachinePlatform(models.Model): id = models.AutoField(primary_key=True) os_name = models.CharField(max_length=25) @@ -579,7 +599,14 @@ class Meta: # queries models.Index(fields=["repository", "job_type", "start_time"]), models.Index(fields=["repository", "build_platform", "job_type", "start_time"]), - models.Index(fields=["repository", "option_collection_hash", "job_type", "start_time"]), + models.Index( + fields=[ + "repository", + "option_collection_hash", + "job_type", + "start_time", + ] + ), models.Index( fields=[ "repository", @@ -690,7 +717,10 @@ class TaskclusterMetadata(models.Model): """ job = models.OneToOneField( - Job, on_delete=models.CASCADE, 
primary_key=True, related_name="taskcluster_metadata" + Job, + on_delete=models.CASCADE, + primary_key=True, + related_name="taskcluster_metadata", ) task_id = models.CharField(max_length=22, validators=[MinLengthValidator(22)], db_index=True) @@ -772,7 +802,15 @@ def who(self): return "autoclassifier" @classmethod - def create(cls, *, job_id, internal_bug_id=None, bugzilla_id=None, user=None, bug_open=False): + def create( + cls, + *, + job_id, + internal_bug_id=None, + bugzilla_id=None, + user=None, + bug_open=False, + ): if (bool(internal_bug_id) ^ bool(bugzilla_id)) is False: raise ValueError("Only one of internal bug ID or Bugzilla ID must be set") if internal_bug_id: @@ -881,7 +919,10 @@ def _ensure_classification(self): return # if the failure type isn't intermittent, ignore - if self.failure_classification.name not in ["intermittent", "intermittent needs bugid"]: + if self.failure_classification.name not in [ + "intermittent", + "intermittent needs bugid", + ]: return # if the linked Job has more than one TextLogError, ignore @@ -1064,7 +1105,7 @@ def to_dict(self): "stack": self.stack, "stackwalk_stdout": self.stackwalk_stdout, "stackwalk_stderr": self.stackwalk_stderr, - "best_classification": metadata.best_classification_id if metadata else None, + "best_classification": (metadata.best_classification_id if metadata else None), "best_is_verified": metadata.best_is_verified if metadata else False, "created": self.created, "modified": self.modified, @@ -1296,7 +1337,9 @@ def verify_classification(self, classification): # None as opposed to RelatedManager. 
if self.metadata is None: TextLogErrorMetadata.objects.create( - text_log_error=self, best_classification=classification, best_is_verified=True + text_log_error=self, + best_classification=classification, + best_is_verified=True, ) else: self.metadata.best_classification = classification @@ -1334,18 +1377,27 @@ class TextLogErrorMetadata(models.Model): """ text_log_error = models.OneToOneField( - TextLogError, primary_key=True, related_name="_metadata", on_delete=models.CASCADE + TextLogError, + primary_key=True, + related_name="_metadata", + on_delete=models.CASCADE, ) failure_line = models.OneToOneField( - FailureLine, on_delete=models.CASCADE, related_name="text_log_error_metadata", null=True + FailureLine, + on_delete=models.CASCADE, + related_name="text_log_error_metadata", + null=True, ) # Note that the case of best_classification = None and best_is_verified = True # has the special semantic that the line is ignored and should not be considered # for future autoclassifications. best_classification = models.ForeignKey( - ClassifiedFailure, related_name="best_for_errors", null=True, on_delete=models.SET_NULL + ClassifiedFailure, + related_name="best_for_errors", + null=True, + on_delete=models.SET_NULL, ) best_is_verified = models.BooleanField(default=False) diff --git a/treeherder/webapp/api/serializers.py b/treeherder/webapp/api/serializers.py index 1982362aed4..baa13be9234 100644 --- a/treeherder/webapp/api/serializers.py +++ b/treeherder/webapp/api/serializers.py @@ -272,9 +272,15 @@ def get_revision_count(self, push): def get_push_timestamp(self, push): return to_timestamp(push.time) + def get_is_git_revision(self, push): + return models.RevisionMapping.objects.filter( + repository=push.repository, git_revision=push.revision + ).exists() + revisions = serializers.SerializerMethodField() revision_count = serializers.SerializerMethodField() push_timestamp = serializers.SerializerMethodField() + is_git_revision = serializers.SerializerMethodField() 
repository_id = serializers.PrimaryKeyRelatedField(source="repository", read_only=True) class Meta: @@ -287,6 +293,7 @@ class Meta: "revision_count", "push_timestamp", "repository_id", + "is_git_revision", ] diff --git a/ui/job-view/pushes/Push.jsx b/ui/job-view/pushes/Push.jsx index e535abb0918..a3ad8c3d0d8 100644 --- a/ui/job-view/pushes/Push.jsx +++ b/ui/job-view/pushes/Push.jsx @@ -700,6 +700,7 @@ function Push({ bugSummaryMap={bugSummaryMap} widthClass="mb-3 ms-4" commitShaClass="font-monospace" + isGitRevision={push.is_git_revision} > {showPushHealthSummary && pushHealthStatus && (
@@ -751,6 +752,7 @@ function Push({ repo={currentRepo} key={tipRevision.revision} commitShaClass="font-monospace" + isGitRevision={push.is_git_revision} /> diff --git a/ui/models/repository.js b/ui/models/repository.js index 51b16efdd2c..6d1661c5fad 100644 --- a/ui/models/repository.js +++ b/ui/models/repository.js @@ -15,6 +15,14 @@ export default class RepositoryModel { this.pushLogUrl = `${this.url}/pushloghtml`; this.revisionHrefPrefix = `${this.url}/rev/`; } + + // For repos transitioning from hg to git, store git link info + // so per-push components can choose the right URL. + if (this.git_url) { + const branch = this.git_branch || 'main'; + this.gitRevisionHrefPrefix = `${this.git_url}/commit/`; + this.gitPushLogUrl = `${this.git_url}/commits/${branch}`; + } } static getList() { @@ -27,14 +35,18 @@ export default class RepositoryModel { return repos.find((repo) => repo.name === name); } - getRevisionHref(revision) { + getRevisionHref(revision, isGitRevision = false) { + if (isGitRevision && this.gitRevisionHrefPrefix) { + return `${this.gitRevisionHrefPrefix}${revision}`; + } return `${this.revisionHrefPrefix}${revision}`; } - getPushLogHref(revision) { - return this.dvcs_type === 'git' - ? this.getRevisionHref(revision) - : `${this.pushLogUrl}?changeset=${revision}`; + getPushLogHref(revision, isGitRevision = false) { + if (this.dvcs_type === 'git' || (isGitRevision && this.git_url)) { + return this.getRevisionHref(revision, isGitRevision); + } + return `${this.pushLogUrl}?changeset=${revision}`; } getPushLogRangeHref(params) { @@ -44,4 +56,11 @@ export default class RepositoryModel { ? 
`${this.url}/compare/${fromchange}...${tochange}` : `${this.pushLogUrl}?${new URLSearchParams(params).toString()}`; } + + getRevisionBaseUrl(isGitRevision = false) { + if (isGitRevision && this.git_url) { + return this.git_url; + } + return this.url; + } } diff --git a/ui/push-health/CommitHistory.jsx b/ui/push-health/CommitHistory.jsx index d3e04e920bc..e6dd4b13fde 100644 --- a/ui/push-health/CommitHistory.jsx +++ b/ui/push-health/CommitHistory.jsx @@ -61,7 +61,11 @@ class CommitHistory extends React.PureComponent { revision, repo: currentRepo.name, }); - const { author, push_timestamp: pushTimestamp } = currentPush; + const { + author, + push_timestamp: pushTimestamp, + is_git_revision: isGitRevision, + } = currentPush; const headerText = revisions[0].comments.split('\n')[0]; const authorMatch = author.match(/<(.*?)>+/); const authorEmail = authorMatch ? authorMatch[1] : author; @@ -86,7 +90,7 @@ class CommitHistory extends React.PureComponent { Push @@ -120,6 +124,7 @@ class CommitHistory extends React.PureComponent { key={revision.revision} commitShaClass="font-weight-bold text-secondary h6" commentFont="h6" + isGitRevision={isGitRevision} />
)} @@ -131,6 +136,7 @@ class CommitHistory extends React.PureComponent { repo={currentRepo} commitShaClass="font-weight-bold text-secondary h6" commentFont="h6" + isGitRevision={isGitRevision} /> )} {showParent && ( diff --git a/ui/shared/Revision.jsx b/ui/shared/Revision.jsx index cc2b5ae32a8..7847d6c9a1b 100644 --- a/ui/shared/Revision.jsx +++ b/ui/shared/Revision.jsx @@ -70,6 +70,7 @@ export class Revision extends React.PureComponent { bugSummaryMap, commitShaClass = 'commit-sha', commentFont = '', + isGitRevision = false, } = this.props; const comment = comments.split('\n')[0]; const bugMatches = comment.match(/-- ([0-9]+)|bug.([0-9]+)/gi); @@ -93,8 +94,8 @@ export class Revision extends React.PureComponent { visible={clipboardVisible} /> {commitRevision.substring(0, 12)} diff --git a/ui/shared/RevisionLinkify.jsx b/ui/shared/RevisionLinkify.jsx index be40b8da94a..82284a68b13 100644 --- a/ui/shared/RevisionLinkify.jsx +++ b/ui/shared/RevisionLinkify.jsx @@ -20,8 +20,12 @@ export default class RevisionLinkify extends React.Component { }, normalize: (match) => { const rev = match.text.replace('rev:', ''); + const prefix = + props.isGitRevision && props.repo.gitRevisionHrefPrefix + ? props.repo.gitRevisionHrefPrefix + : props.repo.revisionHrefPrefix; - match.url = `${props.repo.url}/rev/${rev}`; + match.url = `${prefix}${rev}`; match.text = rev; }, }); diff --git a/ui/shared/RevisionList.jsx b/ui/shared/RevisionList.jsx index 2fd859a7054..dc60032f6d0 100644 --- a/ui/shared/RevisionList.jsx +++ b/ui/shared/RevisionList.jsx @@ -18,6 +18,7 @@ export class RevisionList extends React.PureComponent { bugSummaryMap, commitShaClass = '', commentFont = '', + isGitRevision = false, } = this.props; return ( @@ -30,10 +31,14 @@ export class RevisionList extends React.PureComponent { bugSummaryMap={bugSummaryMap} commitShaClass={commitShaClass} commentFont={commentFont} + isGitRevision={isGitRevision} /> ))} {revisionCount > revisions.length && ( - + )} {children}