# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import annotations import argparse import csv import json import os import re import subprocess import sys import time from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import asdict, dataclass, fields from pathlib import Path from typing import TYPE_CHECKING from urllib import error, request if TYPE_CHECKING: from collections.abc import Iterable DEFAULT_OWNER = "PaddlePaddle" DEFAULT_REPO = "Paddle" HTML_COMMENT_RE = re.compile(r"", re.DOTALL) PR_NUMBER_RE = re.compile(r"\(#(\d+)\)") HEADING_RE = re.compile(r"(?m)^###\s*(.+?)\s*$") RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504} PR_CATEGORIES = [ 'User Experience', 'Execute Infrastructure', 'Operator Mechanism', 'CINN', 'Custom Device', 'Performance Optimization', 'Distributed Strategy', 'Parameter Server', 'Communication Library', 'Auto Parallel', 'Inference', 'Environment Adaptation', 'Others', ] PR_TYPES = [ 'New features', 'Bug fixes', 'Improvements', 'Performance', 'BC Breaking', 'Deprecations', 'Docs', 'Devs', 'Not User Facing', 'Security', 'Others', ] @dataclass(frozen=True) class CommitRecord: commit_hash: str title: str git_author: str pr_number: int | None @dataclass(frozen=True) class PullRequestRecord: number: int title: str author: str labels: list[str] reviewers: list[str] category: str topic: str description: str @dataclass(frozen=True) class CommitRow: commit_hash: str category: str topic: str title: str pr_link: str author: str labels: str accepter_1: str accepter_2: str accepter_3: str description: str COMMIT_FIELDS = tuple(field.name for field in fields(CommitRow)) def run_git(*args: str) -> str: completed = subprocess.run( ["git", *args], check=True, text=True, capture_output=True, ) return completed.stdout.strip() def normalize_whitespace(text: str) -> str: return re.sub(r"\s+", " ", text).strip() def strip_html_comments(text: str) -> str: return HTML_COMMENT_RE.sub("", text or "") def cleanup_markdown_text(text: str) -> str: cleaned_lines = [] for line in strip_html_comments(text).replace("\r\n", "\n").splitlines(): line = line.strip() if not line: continue line = re.sub(r"^[-*+]\s*", "", line) line = re.sub(r"^\d+\.\s*", "", line) cleaned_lines.append(line) return normalize_whitespace(" ".join(cleaned_lines)) def cleanup_title(title: str) -> str: return normalize_whitespace(PR_NUMBER_RE.sub("", title or "")) def extract_pr_number(title: str) -> int | None: matches = PR_NUMBER_RE.findall(title or "") return int(matches[-1]) if matches else None def split_known_choice(raw_text: str, choices: list[str], default: str) -> str: parts = [ cleanup_markdown_text(part) for part in re.split(r'[,|\n]+', raw_text or '') ] parts = [part for part in parts if part] if not parts: return default for part in parts: for choice in choices: if part.casefold() == choice.casefold(): return choice for part in parts: lowered = part.casefold() for choice in choices: if choice.casefold() in lowered: return choice return default def label_value(labels: Iterable[str], prefix: str) -> str | None: prefix_lower = prefix.casefold() for label in labels: if label.casefold().startswith(prefix_lower): _, value = label.split(':', 1) return value.strip() return None def markdown_sections(body: str) -> dict[str, str]: cleaned = ( strip_html_comments(body).replace("\r\n", "\n").replace("\r", "\n") ) matches = list(HEADING_RE.finditer(cleaned)) sections: dict[str, str] = {} for index, match in enumerate(matches): start = match.end() end = ( matches[index + 1].start() if index + 1 < len(matches) else len(cleaned) ) sections[normalize_whitespace(match.group(1))] = cleaned[ start:end ].strip() return sections def section_value(sections: dict[str, str], name: str) -> str: for key, value in sections.items(): if key.casefold() == name.casefold(): return value return "" def resolve_category(labels: list[str], body: str) -> str: value = label_value(labels, 'release notes:') if value: return split_known_choice(value, PR_CATEGORIES, 'Others') sections = markdown_sections(body) return split_known_choice( section_value(sections, 'PR Category'), PR_CATEGORIES, 'Others' ) def resolve_topic(labels: list[str], body: str) -> str: value = label_value(labels, 'topic:') if value: return split_known_choice(value, PR_TYPES, 'Others') sections = markdown_sections(body) return split_known_choice( section_value(sections, 'PR Types'), PR_TYPES, 'Others' ) def resolve_description(body: str) -> str: sections = markdown_sections(body) return cleanup_markdown_text(section_value(sections, 'Description')) def load_token(explicit_token: str | None) -> str | None: if explicit_token: return explicit_token for env_name in ('GITHUB_API_TOKEN', 'GITHUB_TOKEN', 'GH_TOKEN'): value = os.getenv(env_name) if value: return value token_file = Path('~/.gh_tokenrc').expanduser() if token_file.exists(): match = re.search(r'github_oauth\s*=\s*(\S+)', token_file.read_text()) if match: return match.group(1) return None def collect_commits( base_ref: str, head_ref: str, use_merge_base: bool ) -> list[CommitRecord]: start_ref = base_ref if use_merge_base: start_ref = run_git('merge-base', base_ref, head_ref) raw_log = run_git( 'log', '--reverse', '--pretty=format:%H%x1f%s%x1f%an%x1e', f'{start_ref}..{head_ref}', ) if not raw_log: return [] commits = [] for record in raw_log.split('\x1e'): record = record.strip() if not record: continue commit_hash, title, author = record.split('\x1f') commits.append( CommitRecord( commit_hash=commit_hash, title=title, git_author=author, pr_number=extract_pr_number(title), ) ) return commits class PullRequestCache: def __init__(self, path: Path): self.path = path self.path.parent.mkdir(parents=True, exist_ok=True) self._data: dict[str, dict[str, object]] = {} if self.path.exists(): self._data = json.loads(self.path.read_text()) def get_many(self, numbers: Iterable[int]) -> dict[int, PullRequestRecord]: records = {} for number in numbers: cached = self._data.get(str(number)) if cached is not None: cached = dict(cached) cached.setdefault('title', f'PR #{number}') records[number] = PullRequestRecord(**cached) return records def update_many(self, records: Iterable[PullRequestRecord]) -> None: changed = False for record in records: self._data[str(record.number)] = asdict(record) changed = True if changed: self.path.write_text( json.dumps(self._data, indent=2, sort_keys=True) + '\n' ) class GitHubClient: def __init__(self, token: str | None, owner: str, repo: str): self.owner = owner self.repo = repo self.headers = {'Accept': 'application/vnd.github+json'} if token: self.headers['Authorization'] = f'bearer {token}' def _request_graphql(self, query: str) -> dict[str, object]: last_error: Exception | None = None for attempt in range(3): try: payload = json.dumps({'query': query}).encode() http_request = request.Request( 'https://api.github.com/graphql', data=payload, headers={ **self.headers, 'Content-Type': 'application/json', }, method='POST', ) with request.urlopen(http_request, timeout=60) as response: payload = json.loads(response.read().decode()) if payload.get('errors'): raise RuntimeError( f"GitHub GraphQL query failed: {payload['errors']}" ) return payload except error.HTTPError as http_error: last_error = http_error if http_error.code in RETRYABLE_STATUS_CODES and attempt < 2: time.sleep(1 + attempt) continue raise except (error.URLError, ValueError, RuntimeError) as request_error: last_error = request_error if attempt == 2: raise time.sleep(1 + attempt) assert last_error is not None raise last_error def _fetch_batch(self, batch: list[int]) -> dict[int, PullRequestRecord]: query_parts = [] for number in batch: query_parts.append( f''' pr_{number}: pullRequest(number: {number}) {{ title author {{ login }} body labels(first: 50) {{ nodes {{ name }} }} reviews(first: 100, states: APPROVED) {{ nodes {{ author {{ login }} }} }} }} ''' ) payload = self._request_graphql( f''' query {{ repository(owner: "{self.owner}", name: "{self.repo}") {{ {''.join(query_parts)} }} }} ''' ) repository = payload['data']['repository'] results: dict[int, PullRequestRecord] = {} for number in batch: node = repository.get(f'pr_{number}') if node is None: continue labels = [item['name'] for item in node['labels']['nodes']] reviewers = sorted( { review['author']['login'] for review in node['reviews']['nodes'] if review.get('author') } ) body = node.get('body') or '' results[number] = PullRequestRecord( number=number, title=node.get('title') or f'PR #{number}', author=(node.get('author') or {}).get('login', ''), labels=labels, reviewers=reviewers, category=resolve_category(labels, body), topic=resolve_topic(labels, body), description=resolve_description(body), ) return results def fetch_pull_requests( self, numbers: list[int], batch_size: int, workers: int ) -> dict[int, PullRequestRecord]: unique_numbers = sorted(set(numbers)) if not unique_numbers: return {} batches = [ unique_numbers[index : index + batch_size] for index in range(0, len(unique_numbers), batch_size) ] results: dict[int, PullRequestRecord] = {} if workers <= 1 or len(batches) == 1: for batch in batches: results.update(self._fetch_batch(batch)) return results with ThreadPoolExecutor(max_workers=workers) as executor: future_to_batch = { executor.submit(self._fetch_batch, batch): batch for batch in batches } for future in as_completed(future_to_batch): results.update(future.result()) return results def build_commit_rows( commits: list[CommitRecord], pull_requests: dict[int, PullRequestRecord], owner: str, repo: str, ) -> tuple[list[CommitRow], list[int]]: rows: list[CommitRow] = [] missing_prs: set[int] = set() for commit in commits: if commit.pr_number is None: rows.append( CommitRow( commit_hash=commit.commit_hash, category='Others', topic='Others', title=commit.title, pr_link='', author=commit.git_author, labels='', accepter_1='', accepter_2='', accepter_3='', description='', ) ) continue pr_record = pull_requests.get(commit.pr_number) if pr_record is None: missing_prs.add(commit.pr_number) rows.append( CommitRow( commit_hash=commit.commit_hash, category='Others', topic='Others', title=commit.title, pr_link=f'https://github.com/{owner}/{repo}/pull/{commit.pr_number}', author=commit.git_author, labels='', accepter_1='', accepter_2='', accepter_3='', description='', ) ) continue accepters = [*pr_record.reviewers, '', '', ''][:3] rows.append( CommitRow( commit_hash=commit.commit_hash, category=pr_record.category, topic=pr_record.topic, title=pr_record.title or commit.title, pr_link=f'https://github.com/{owner}/{repo}/pull/{commit.pr_number}', author=pr_record.author or commit.git_author, labels=','.join(pr_record.labels), accepter_1=accepters[0], accepter_2=accepters[1], accepter_3=accepters[2], description=pr_record.description, ) ) return rows, sorted(missing_prs) def ordered_values( items: Iterable[str], preferred_order: list[str] ) -> list[str]: values = {item for item in items if item} ordered = [value for value in preferred_order if value in values] extras = sorted(values - set(preferred_order), key=str.casefold) return ordered + extras def get_hash_or_pr_url(commit: CommitRow) -> str: if not commit.pr_link: return commit.commit_hash matches = re.findall( r'https://github.com/[^/]+/[^/]+/pull/(\d+)', commit.pr_link, ) if not matches: return commit.commit_hash return f'[#{matches[0]}]({commit.pr_link})' def markdown_entry_text(commit: CommitRow) -> str: return cleanup_title(commit.title) def category_output_name(category: str) -> str: cleaned = re.sub(r'[^0-9A-Za-z._-]+', '_', category.strip()) cleaned = cleaned.strip('._') return cleaned or 'Others' def get_markdown_header(category: str) -> str: return ( f'# Release Notes worksheet {category}\n' '- polish PR title to make it human read friendly.\n' '- edit, delete, merge multiple PRs.\n' '- summarize notes for this category.\n\n' ) def render_markdown( commits: list[CommitRow], base_ref: str, head_ref: str ) -> str: lines = [ '# Release Notes\n\n', f'- Range: `{base_ref}..{head_ref}`\n', f'- Commits: {len(commits)}\n\n', ] categories = ordered_values( (commit.category for commit in commits), PR_CATEGORIES ) for category in categories: lines.append(f'## {category}\n\n') category_commits = [ commit for commit in commits if commit.category == category ] topics = ordered_values( (commit.topic for commit in category_commits), PR_TYPES ) for topic in topics: topic_commits = [ commit for commit in category_commits if commit.topic == topic ] if not topic_commits: continue lines.append(f'### {topic}\n\n') for commit in topic_commits: lines.append( f'- {markdown_entry_text(commit)} ({get_hash_or_pr_url(commit)})\n' ) lines.append('\n') return ''.join(lines) def render_category_markdown(category: str, commits: list[CommitRow]) -> str: lines = [get_markdown_header(category), f'## {category}\n\n'] topics = ordered_values((commit.topic for commit in commits), PR_TYPES) for topic in topics: topic_commits = [commit for commit in commits if commit.topic == topic] if not topic_commits: continue lines.append(f'### {topic}\n\n') for commit in topic_commits: lines.append( f'- {markdown_entry_text(commit)} ({get_hash_or_pr_url(commit)})\n' ) lines.append('\n') return ''.join(lines) def write_csv(path: Path, rows: list[CommitRow]) -> None: path.parent.mkdir(parents=True, exist_ok=True) with path.open('w', newline='') as csv_file: writer = csv.writer(csv_file) writer.writerow(COMMIT_FIELDS) for row in rows: writer.writerow([getattr(row, field) for field in COMMIT_FIELDS]) def write_lines(path: Path, lines: Iterable[str]) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(''.join(f'{line}\n' for line in lines)) def write_category_exports( export_dir: Path, rows: list[CommitRow] ) -> list[Path]: export_dir.mkdir(parents=True, exist_ok=True) written_files: list[Path] = [] categories = ordered_values((row.category for row in rows), PR_CATEGORIES) for category in categories: category_rows = [row for row in rows if row.category == category] if not category_rows: continue category_name = category_output_name(category) category_dir = export_dir / category_name markdown_path = category_dir / f'result_{category_name}.md' csv_path = category_dir / f'result_{category_name}.csv' markdown_path.parent.mkdir(parents=True, exist_ok=True) markdown_path.write_text( render_category_markdown(category, category_rows) ) write_csv(csv_path, category_rows) written_files.extend([markdown_path, csv_path]) return written_files def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description='Generate release-note CSV and Markdown between two commits in one command.' ) parser.add_argument('base_ref', help='Base git ref or commit') parser.add_argument('head_ref', help='Head git ref or commit') parser.add_argument( '--output-dir', default='results', help='Directory for generated files (default: results)', ) parser.add_argument( '--owner', default=DEFAULT_OWNER, help=f'GitHub owner (default: {DEFAULT_OWNER})', ) parser.add_argument( '--repo', default=DEFAULT_REPO, help=f'GitHub repo (default: {DEFAULT_REPO})', ) parser.add_argument( '--token', help='GitHub token; falls back to env vars or ~/.gh_tokenrc', ) parser.add_argument( '--cache-path', default='results/pr_cache.json', help='PR metadata cache path (default: results/pr_cache.json)', ) parser.add_argument( '--batch-size', type=int, default=25, help='How many PRs to fetch per GitHub GraphQL request (default: 25)', ) parser.add_argument( '--workers', type=int, default=4, help='How many GraphQL batches to fetch concurrently (default: 4)', ) parser.add_argument( '--direct-range', action='store_true', help='Use base_ref..head_ref directly instead of merge-base(base_ref, head_ref)..head_ref', ) parser.add_argument( '--local-only', action='store_true', help='Skip GitHub API calls and only use local git metadata', ) return parser.parse_args() def main() -> int: args = parse_args() output_dir = Path(args.output_dir) csv_path = output_dir / 'commitlist.csv' export_dir = output_dir / 'export' contributors_path = output_dir / 'contributors.txt' commits = collect_commits( args.base_ref, args.head_ref, use_merge_base=not args.direct_range, ) if not commits: raise SystemExit( f'No commits found between {args.base_ref} and {args.head_ref}.' ) pull_requests: dict[int, PullRequestRecord] = {} pr_numbers = sorted( {commit.pr_number for commit in commits if commit.pr_number is not None} ) if args.local_only and pr_numbers: print( 'Warning: running in --local-only mode. PR metadata will not be fetched, ' 'so category/topic fall back to Others and descriptions stay empty.', file=sys.stderr, ) if not args.local_only and pr_numbers: token = load_token(args.token) if not token: print( 'Warning: GitHub token not found. Falling back to unauthenticated ' 'GitHub API requests; this may hit rate limits on large ranges.', file=sys.stderr, ) cache = PullRequestCache(Path(args.cache_path)) cached = cache.get_many(pr_numbers) missing = [ number for number in pr_numbers if number not in cached or cached[number].title == f'PR #{number}' ] fetched = GitHubClient( args.token or token, args.owner, args.repo ).fetch_pull_requests( missing, batch_size=max(1, args.batch_size), workers=max(1, args.workers), ) cache.update_many(fetched.values()) pull_requests = {**cached, **fetched} rows, missing_prs = build_commit_rows( commits, pull_requests, args.owner, args.repo ) contributors = sorted( {commit.git_author for commit in commits if commit.git_author}, key=str.casefold, ) write_csv(csv_path, rows) exported_files = write_category_exports(export_dir, rows) write_lines(contributors_path, contributors) if missing_prs and not args.local_only: print( 'Warning: failed to fetch metadata for PRs ' + ', '.join(f'#{number}' for number in missing_prs) + '. Fallback commit metadata was used.', file=sys.stderr, ) print(f'Generated {len(rows)} rows from {len(commits)} commits.') print(f'CSV: {csv_path}') print(f'Export dir: {export_dir}') print(f'Category exports: {len(exported_files)} files') print(f'Contributors: {contributors_path}') return 0 if __name__ == '__main__': raise SystemExit(main())