# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Extract docstrings from pyarrow runtime and insert them into stub files.

Usage:
    python scripts/update_stub_docstrings.py <install_prefix> <source_dir>
"""

import argparse
import importlib
import inspect
import os
import shutil
import sys
import tempfile
from pathlib import Path
from textwrap import indent

import libcst
from libcst import matchers as m


# One indentation level inside the generated stub docstrings.
_INDENT = " " * 4


def _resolve_object(module, path):
    """Resolve an object by dotted path from a module.

    Parameters
    ----------
    module : module
        Object the lookup starts from.
    path : str
        Dotted attribute path relative to ``module``; an empty path
        resolves to ``module`` itself.

    Returns
    -------
    tuple
        ``(obj, parent, name)`` where ``parent`` is the object ``obj`` was
        looked up on and ``name`` is ``obj.__name__`` when available, else
        the last path component. ``(None, None, None)`` when any component
        cannot be resolved.
    """
    if not path:
        return module, None, module.__name__

    parts = path.split(".")
    parent = None
    obj = module

    for part in parts:
        parent = obj
        try:
            obj = getattr(obj, part)
        except AttributeError:
            # Fall back to the parent's own namespace for names that
            # getattr() does not expose.
            try:
                obj = vars(parent).get(part)
                if obj is not None:
                    continue
            except TypeError:
                # vars() rejects objects without a __dict__.
                pass
            return None, None, None

    return obj, parent, getattr(obj, "__name__", parts[-1])


def _escape_for_literal(text):
    """Escape text for embedding in a non-raw triple-double-quoted literal.

    Backslashes are doubled so they are not read as escape sequences, and
    every run of three double quotes is escaped so it cannot terminate the
    literal early.
    """
    return text.replace("\\", "\\\\").replace('"""', '\\"\\"\\"')


def _get_docstring(name, module, indentation):
    """Extract and format a docstring for insertion into a stub file.

    Parameters
    ----------
    name : str
        Dotted path of the object, relative to ``module``.
    module : module
        Runtime module the docstring is read from.
    indentation : int
        Nesting depth of the target node; each level indents the docstring
        body by four spaces.

    Returns
    -------
    str or None
        Source text of a triple-quoted string literal, or ``None`` when the
        object cannot be resolved or has no (remaining) docstring.
    """
    obj, parent, obj_name = _resolve_object(module, name)
    if obj is None:
        print(f"{name} not found in {module.__name__}")
        return None

    docstring = inspect.getdoc(obj)
    if not docstring:
        return None

    # Remove signature prefix: when the text starts with the object's own
    # (optionally parent-qualified) name, drop the first two lines
    # (presumably an embedded signature and the blank line after it).
    parent_name = getattr(parent, "__name__", None) if parent else None
    if docstring.startswith(obj_name) or (
        parent_name and docstring.startswith(f"{parent_name}.{obj_name}")
    ):
        docstring = "\n".join(docstring.splitlines()[2:])

    # Skip empty docstrings
    if not docstring.strip():
        return None

    # The text is pasted verbatim between triple quotes, so anything the
    # Python lexer would interpret has to be escaped first.
    docstring = _escape_for_literal(docstring)

    prefix = _INDENT * indentation
    return '"""\n' + indent(docstring + '\n"""', prefix)


def _docstring_statement(docstring):
    """Wrap docstring literal source in a standalone expression statement."""
    return libcst.SimpleStatementLine(
        body=[libcst.Expr(value=libcst.SimpleString(value=docstring))]
    )


# ``name = _clone_signature(...)`` at module level. The first target must be
# a plain name because that name is used to look the object up at runtime.
_CLONE_SIGNATURE_STMT = m.SimpleStatementLine(
    body=[
        m.Assign(
            targets=[m.AssignTarget(target=m.Name()), m.ZeroOrMore()],
            value=m.Call(func=m.Name(value="_clone_signature")),
        ),
        m.ZeroOrMore(),
    ]
)

# Class whose indented body starts with a ``...`` statement.
_ELLIPSIS_CLASS = m.ClassDef(
    body=m.IndentedBlock(
        body=[
            m.SimpleStatementLine(
                body=[m.Expr(value=m.Ellipsis()), m.ZeroOrMore()]
            ),
            m.ZeroOrMore(),
        ]
    )
)

# Class whose indented body starts with a function definition.
_FUNC_FIRST_CLASS = m.ClassDef(
    body=m.IndentedBlock(body=[m.FunctionDef(), m.ZeroOrMore()])
)

# Function written on a single line as ``def f(...): ...``.
_ELLIPSIS_FUNC = m.FunctionDef(
    body=m.SimpleStatementSuite(body=[m.Expr(value=m.Ellipsis())])
)


class DocstringInserter(libcst.CSTTransformer):
    """CST transformer that inserts docstrings into stub file nodes.

    Parameters
    ----------
    module : module
        Runtime module docstrings are read from.
    namespace : str
        Dotted prefix, relative to ``module``, under which the names of the
        stub file being transformed are looked up; empty for no prefix.
    """

    def __init__(self, module, namespace):
        self.module = module
        self.base_namespace = namespace
        # Names of the class/function definitions currently being visited,
        # outermost first.
        self.stack = []
        # Nesting depth of the node currently being visited.
        self.indentation = 0

    def _full_name(self):
        """Return the dotted lookup path of the innermost visited node."""
        name = ".".join(self.stack)
        return f"{self.base_namespace}.{name}" if self.base_namespace else name

    def leave_Module(self, original_node, updated_node):
        """Add a docstring after each top-level ``_clone_signature`` assignment."""
        new_body = []
        for stmt in updated_node.body:
            new_body.append(stmt)
            if not m.matches(stmt, _CLONE_SIGNATURE_STMT):
                continue
            name = stmt.body[0].targets[0].target.value
            if self.base_namespace:
                name = f"{self.base_namespace}.{name}"
            docstring = _get_docstring(name, self.module, 0)
            if docstring:
                new_body.append(_docstring_statement(docstring))
        return updated_node.with_changes(body=new_body)

    def visit_ClassDef(self, node):
        """Record entry into a class definition."""
        self.stack.append(node.name.value)
        self.indentation += 1

    def leave_ClassDef(self, original_node, updated_node):
        """Insert the runtime docstring into a class stub.

        A leading ``...`` statement is replaced by the docstring; when the
        body starts with a function definition the docstring is prepended.
        Classes of any other shape are returned unchanged.
        """
        name = self._full_name()
        docstring = _get_docstring(name, self.module, self.indentation)

        if docstring:
            if m.matches(updated_node, _ELLIPSIS_CLASS):
                placeholder = updated_node.body.body[0].body[0].value
                updated_node = updated_node.deep_replace(
                    placeholder, libcst.SimpleString(value=docstring)
                )
            elif m.matches(updated_node, _FUNC_FIRST_CLASS):
                new_statements = [_docstring_statement(docstring)]
                new_statements.extend(updated_node.body.body)
                updated_node = updated_node.with_changes(
                    body=updated_node.body.with_changes(body=new_statements)
                )

        self.stack.pop()
        self.indentation -= 1
        return updated_node

    def visit_FunctionDef(self, node):
        """Record entry into a function definition."""
        self.stack.append(node.name.value)
        self.indentation += 1

    def leave_FunctionDef(self, original_node, updated_node):
        """Replace a ``def f(...): ...`` body with the runtime docstring."""
        name = self._full_name()

        if m.matches(original_node, _ELLIPSIS_FUNC):
            docstring = _get_docstring(name, self.module, self.indentation)
            if docstring:
                updated_node = updated_node.with_changes(
                    body=libcst.IndentedBlock(
                        body=[_docstring_statement(docstring)]
                    )
                )

        self.stack.pop()
        self.indentation -= 1
        return updated_node


# Stub modules whose names are looked up under ``pyarrow.lib``.
LIB_MODULES = {"array", "builder", "compat", "config", "device", "error", "io",
               "_ipc", "memory", "pandas_shim", "scalar", "table", "tensor",
               "_types"}


def add_docstrings_to_stubs(stubs_dir):
    """Update all stub files in stubs_dir with docstrings from pyarrow runtime.

    Every ``*.pyi`` file below ``stubs_dir`` except ``_stubs_typing.pyi`` is
    parsed, transformed with :class:`DocstringInserter` and written back in
    place. ``pyarrow`` must be importable when this is called.

    Parameters
    ----------
    stubs_dir : str or pathlib.Path
        Directory tree containing the stub files.
    """
    stubs_dir = Path(stubs_dir)
    print(f"Updating stub docstrings in: {stubs_dir}")

    pyarrow = importlib.import_module("pyarrow")

    for stub_file in sorted(stubs_dir.rglob('*.pyi')):
        if stub_file.name == "_stubs_typing.pyi":
            continue

        # Map the stub file to the dotted path, relative to ``pyarrow``,
        # under which its names live at runtime.
        module_name = stub_file.stem
        package_name = stub_file.parent.name
        if module_name in LIB_MODULES:
            namespace = "lib"
        elif package_name in ("parquet", "interchange"):
            if module_name == "__init__":
                namespace = package_name
            else:
                namespace = f"{package_name}.{module_name}"
        elif module_name == "__init__":
            namespace = ""
        else:
            namespace = module_name

        print(f"  {stub_file.name} -> {namespace or '(root)'}")
        tree = libcst.parse_module(stub_file.read_text(encoding="utf-8"))
        modified = tree.visit(DocstringInserter(pyarrow, namespace))
        stub_file.write_text(modified.code, encoding="utf-8")


def _link_or_copy(source, destination):
    """Make ``source`` available at ``destination`` by symlink or copy.

    Parameters
    ----------
    source : pathlib.Path
        Existing file or directory.
    destination : pathlib.Path
        Path to create.
    """
    # Prefer symlinks (faster, no disk use) but fall back to copying when the
    # filesystem doesn't support them (e.g. Docker volumes, network mounts).
    # Symlinks are never attempted on Windows.
    if sys.platform != "win32":
        try:
            os.symlink(source, destination)
            return
        except OSError:
            pass

    if source.is_dir():
        shutil.copytree(source, destination, symlinks=(sys.platform != "win32"))
    else:
        shutil.copy2(source, destination)


def _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir):
    """
    Assemble an importable pyarrow package inside a temporary directory.

    During wheel builds the .py sources and compiled binary artifacts live in
    separate trees (source checkout vs CMake install prefix). This function
    symlinks (or copies) both into pyarrow_pkg folder so that a plain
    ``import pyarrow`` works and docstrings can be extracted at build time.

    Parameters
    ----------
    pyarrow_pkg : pathlib.Path
        Existing directory that receives the assembled package.
    source_dir : pathlib.Path
        Directory containing the ``pyarrow`` source package.
    install_pyarrow_dir : pathlib.Path
        Directory containing the installed build artifacts.

    Raises
    ------
    FileNotFoundError
        If ``source_dir / "pyarrow"`` does not exist.
    """
    source_pyarrow = source_dir / "pyarrow"
    if not source_pyarrow.exists():
        raise FileNotFoundError(
            f"PyArrow source package not found: {source_pyarrow}")

    # Python sources plus sub-packages; hidden and dunder directories
    # (names starting with "." or "__") are left out.
    for source_path in sorted(source_pyarrow.iterdir()):
        if source_path.suffix == ".py":
            _link_or_copy(source_path, pyarrow_pkg / source_path.name)
        elif source_path.is_dir() and not source_path.name.startswith((".", "__")):
            _link_or_copy(source_path, pyarrow_pkg / source_path.name)

    # Installed files other than stubs; entries already provided by the
    # source tree take precedence.
    for artifact in sorted(install_pyarrow_dir.iterdir()):
        if not artifact.is_file() or artifact.suffix == ".pyi":
            continue
        destination = pyarrow_pkg / artifact.name
        if not destination.exists():
            _link_or_copy(artifact, destination)


def main():
    """Command-line entry point: inject docstrings into the installed stubs."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("install_prefix", type=Path,
                        help="CMAKE_INSTALL_PREFIX used by wheel build")
    parser.add_argument("source_dir", type=Path,
                        help="PyArrow source directory")
    args = parser.parse_args()

    install_prefix = args.install_prefix.resolve()
    source_dir = args.source_dir.resolve()

    # Use <prefix>/pyarrow when it exists, otherwise the prefix itself.
    install_pyarrow_dir = install_prefix / "pyarrow"
    if not install_pyarrow_dir.exists():
        install_pyarrow_dir = install_prefix

    if not any(install_pyarrow_dir.rglob("*.pyi")):
        print("No .pyi files found in install tree, skipping docstring injection")
        sys.exit(0)

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
        pyarrow_pkg = Path(tmpdir) / "pyarrow"
        pyarrow_pkg.mkdir()
        _create_importable_pyarrow(pyarrow_pkg, source_dir, install_pyarrow_dir)

        sys.path.insert(0, tmpdir)
        try:
            add_docstrings_to_stubs(install_pyarrow_dir)
        finally:
            # Remove our own entry by value: code run during the import may
            # have changed sys.path, so index 0 is not guaranteed to be ours.
            try:
                sys.path.remove(tmpdir)
            except ValueError:
                pass


if __name__ == "__main__":
    main()