#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import html
import itertools
import os
import re
from collections import namedtuple

ExpressionInfo = namedtuple(
    "ExpressionInfo", "className name usage arguments examples note since deprecated group")

# Operators have no alphanumeric characters to build an anchor from, so each
# one is mapped to a descriptive word instead.
_SPECIAL_CHAR_ANCHORS = {
    '!': 'not',
    '!=': 'notequal',
    '<>': 'notequal2',
    '<': 'lt',
    '<=': 'lte',
    '<=>': 'nullsafeequal',
    '=': 'eq',
    '==': 'equal',
    '>': 'gt',
    '>=': 'gte',
    '&': 'bitand',
    '|': 'bitor',
    '^': 'bitxor',
    '~': 'bitnot',
    '<<': 'shiftleft',
    '>>': 'shiftright',
    '>>>': 'shiftrightunsigned',
    '+': 'plus',
    '-': 'minus',
    '*': 'multiply',
    '/': 'divide',
    '%': 'mod',
    '||': 'concat',
}

# Everything outside this character set is dropped from a generated anchor.
_ANCHOR_INVALID_CHARS = re.compile(r'[^a-z0-9_-]')

# Layout of the description strings: the section header line is indented by
# four spaces and every body line by six; notes and deprecation texts are
# indented by four.  The formatters below slice exactly these widths off, so
# the literals in `_virtual_operator_infos` must use the same indentation.
_ARGUMENTS_PREFIX = "\n    Arguments:"
_EXAMPLES_PREFIX = "\n    Examples:"
_BODY_INDENT = 6
_NOTE_INDENT = 4

# Inline CSS emitted at the top of the index page.  The class name must match
# the <div> written around each category's function links.
_INDEX_CSS = """<style>
.func-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
  gap: 4px 16px;
  margin-bottom: 24px;
}
</style>

"""


def _make_anchor(name):
    """
    Convert function name to a valid HTML anchor.
    Special characters are converted to descriptive names.

    Parameters:
        name (str): The function name.

    Returns:
        str: A valid HTML anchor string.
    """
    if name in _SPECIAL_CHAR_ANCHORS:
        return _SPECIAL_CHAR_ANCHORS[name]

    # Regular names: lowercase, spaces become hyphens, and any character that
    # is not a letter, digit, underscore or hyphen is removed.
    anchor = name.lower().replace(" ", "-")
    return _ANCHOR_INVALID_CHARS.sub('', anchor)


def _get_display_name(group):
    """
    Convert group name to display name.

    Parameters:
        group (str): The group name (e.g., "agg_funcs", "window_funcs").

    Returns:
        str: The display name (e.g., "Agg Functions", "Window Functions").
    """
    if not group:
        return "Misc Functions"

    # Drop the _funcs suffix, turn underscores into spaces, and title-case.
    name = group.replace("_funcs", "").replace("_", " ").title()
    return "%s Functions" % name


def _get_file_name(group):
    """
    Convert group name to file name.

    Parameters:
        group (str): The group name (e.g., "agg_funcs", "window_funcs", "operator").

    Returns:
        str: The file name (e.g., "agg-functions", "window-functions",
            "operator-functions").
    """
    if not group:
        return "misc-functions"

    file_name = group.replace("_funcs", "-functions").replace("_", "-")
    # Groups without the _funcs suffix (e.g. "operator") still get the
    # -functions ending so that all page names look alike.
    if not file_name.endswith("-functions"):
        file_name += "-functions"
    return file_name


# Groups that should be merged into other groups
GROUP_MERGES = {
    "lambda_funcs": "collection_funcs",  # SPARK-45232
}

# Documentation for operators/constructs that are not returned by the JVM
# listing.  The whitespace inside these strings is significant; see the
# indentation constants above.
_virtual_operator_infos = [
    ExpressionInfo(
        className="",
        name="!=",
        usage="expr1 != expr2 - Returns true if `expr1` is not equal to `expr2`, " +
              "or false otherwise.",
        arguments="\n    Arguments:\n      " +
                  "* expr1, expr2 - the two expressions must be same type or can be casted to\n" +
                  "          a common type, and must be a type that can be used in equality\n" +
                  "          comparison. Map type is not supported.\n" +
                  "          For complex types such array/struct, the data types of fields\n" +
                  "          must be orderable.",
        examples="\n    Examples:\n      " +
                 "> SELECT 1 != 2;\n      " +
                 " true\n      " +
                 "> SELECT 1 != '2';\n      " +
                 " true\n      " +
                 "> SELECT true != NULL;\n      " +
                 " NULL\n      " +
                 "> SELECT NULL != NULL;\n      " +
                 " NULL",
        note="",
        since="1.0.0",
        deprecated="",
        group="predicate_funcs"),
    ExpressionInfo(
        className="",
        name="<>",
        usage="expr1 != expr2 - Returns true if `expr1` is not equal to `expr2`, " +
              "or false otherwise.",
        arguments="\n    Arguments:\n      " +
                  "* expr1, expr2 - the two expressions must be same type or can be casted to\n" +
                  "          a common type, and must be a type that can be used in equality\n" +
                  "          comparison. Map type is not supported.\n" +
                  "          For complex types such array/struct, the data types of fields\n" +
                  "          must be orderable.",
        examples="\n    Examples:\n      " +
                 "> SELECT 1 != 2;\n      " +
                 " true\n      " +
                 "> SELECT 1 != '2';\n      " +
                 " true\n      " +
                 "> SELECT true != NULL;\n      " +
                 " NULL\n      " +
                 "> SELECT NULL != NULL;\n      " +
                 " NULL",
        note="",
        since="1.0.0",
        deprecated="",
        group="predicate_funcs"),
    ExpressionInfo(
        className="",
        name="case",
        usage="CASE expr1 WHEN expr2 THEN expr3 " +
              "[WHEN expr4 THEN expr5]* [ELSE expr6] END - " +
              "When `expr1` = `expr2`, returns `expr3`; " +
              "when `expr1` = `expr4`, return `expr5`; else return `expr6`.",
        arguments="\n    Arguments:\n      " +
                  "* expr1 - the expression which is one operand of comparison.\n      " +
                  "* expr2, expr4 - the expressions each of which is the other " +
                  " operand of comparison.\n      " +
                  "* expr3, expr5, expr6 - the branch value expressions and else value expression" +
                  " should all be same type or coercible to a common type.",
        examples="\n    Examples:\n      " +
                 "> SELECT CASE col1 WHEN 1 THEN 'one' " +
                 "WHEN 2 THEN 'two' ELSE '?' END FROM VALUES 1, 2, 3;\n      " +
                 " one\n      " +
                 " two\n      " +
                 " ?\n      " +
                 "> SELECT CASE col1 WHEN 1 THEN 'one' " +
                 "WHEN 2 THEN 'two' END FROM VALUES 1, 2, 3;\n      " +
                 " one\n      " +
                 " two\n      " +
                 " NULL",
        note="",
        since="1.0.1",
        deprecated="",
        group="conditional_funcs"),
    ExpressionInfo(
        className="",
        name="||",
        usage="expr1 || expr2 - Returns the concatenation of `expr1` and `expr2`.",
        arguments="",
        examples="\n    Examples:\n      " +
                 "> SELECT 'Spark' || 'SQL';\n      " +
                 " SparkSQL\n      " +
                 "> SELECT array(1, 2, 3) || array(4, 5) || array(6);\n      " +
                 " [1,2,3,4,5,6]",
        note="\n    || for arrays is available since 2.4.0.\n",
        since="2.3.0",
        deprecated="",
        group="string_funcs")
]


def _substitute_name(text, name):
    """Replace the `_FUNC_` placeholder in `text` with `name`; None passes through."""
    return text.replace("_FUNC_", name) if text is not None else text


def _list_function_infos(jvm):
    """
    Returns a list of function information via JVM. Sorts wrapped expression infos by name
    and returns them.

    The entries of `_virtual_operator_infos` are included alongside the ones
    reported by `PythonSQLUtils.listBuiltinFunctionInfos()`, and groups listed
    in `GROUP_MERGES` are renamed to their merge target.
    """
    jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()
    infos = list(_virtual_operator_infos)  # copy, the module-level list stays untouched
    for jinfo in jinfos:
        name = jinfo.getName()
        group = jinfo.getGroup()
        infos.append(ExpressionInfo(
            className=jinfo.getClassName(),
            name=name,
            usage=_substitute_name(jinfo.getUsage(), name),
            arguments=_substitute_name(jinfo.getArguments(), name),
            examples=_substitute_name(jinfo.getExamples(), name),
            note=_substitute_name(jinfo.getNote(), name),
            since=jinfo.getSince(),
            deprecated=jinfo.getDeprecated(),
            group=GROUP_MERGES.get(group, group)))
    return sorted(infos, key=lambda i: i.name)


def _group_key(info):
    """Return the grouping key of `info`; a missing group is normalized to ""."""
    return info.group or ""


def _list_grouped_function_infos(jvm):
    """
    Returns a list of function information grouped by category.
    Each item is a tuple of (group_key, list_of_infos).

    Functions without a group (None or "") all end up under the single key "".
    """
    infos = _list_function_infos(jvm)
    # itertools.groupby only merges *adjacent* items, so the sort and the
    # grouping have to use the same key.  Grouping by the raw `group` value
    # would split interleaved None / "" entries into several runs that all map
    # to the same output file and overwrite one another.
    ordered = sorted(infos, key=lambda info: (_group_key(info), info.name))
    return [(key, list(members)) for key, members in itertools.groupby(ordered, key=_group_key)]


def _make_pretty_usage(usage):
    """
    Makes the usage description pretty and returns a formatted string if `usage`
    is not an empty string. Otherwise, returns None.
    """
    if usage is None or usage.strip() == "":
        return None
    stripped_lines = [line.strip() for line in usage.split("\n")]
    return "%s\n\n" % "\n".join(stripped_lines)


def _strip_section_body(section):
    """Drop the header line of `section` and remove the body indentation from the rest."""
    body_lines = section.strip().split("\n")[1:]
    return "\n".join(line[_BODY_INDENT:] for line in body_lines)


def _make_pretty_arguments(arguments):
    """
    Makes the arguments description pretty and returns a formatted string if `arguments`
    starts with the argument prefix. Otherwise, returns None.

    Expected input:

        Arguments:
          * arg0 - ...
              ...
          * arg0 - ...
              ...

    Expected output:
    **Arguments:**

    * arg0 - ...
        ...
    * arg0 - ...
        ...

    """
    if not arguments or not arguments.startswith(_ARGUMENTS_PREFIX):
        return None
    return "**Arguments:**\n\n%s\n\n" % _strip_section_body(arguments)


def _make_pretty_examples(examples):
    """
    Makes the examples description pretty and returns a formatted string if `examples`
    starts with the example prefix. Otherwise, returns None.

    Expected input:

        Examples:
          > SELECT ...;
           ...
          > SELECT ...;
           ...

    Expected output:
    **Examples:**

    ```
    > SELECT ...;
     ...
    > SELECT ...;
     ...
    ```

    """
    if not examples or not examples.startswith(_EXAMPLES_PREFIX):
        return None
    return "**Examples:**\n\n```\n%s\n```\n\n" % _strip_section_body(examples)


def _make_pretty_note(note):
    """
    Makes the note description pretty and returns a formatted string if `note` is not
    an empty string. Otherwise, returns None.

    Expected input:

        ...

    Expected output:
    **Note:**

    ...

    """
    if not note:
        return None
    body = "\n".join(line[_NOTE_INDENT:] for line in note.split("\n"))
    return "**Note:**\n%s\n" % body


def _make_pretty_deprecated(deprecated):
    """
    Makes the deprecated description pretty and returns a formatted string if `deprecated`
    is not an empty string. Otherwise, returns None.

    Expected input:

        ...

    Expected output:
    **Deprecated:**

    ...

    """
    if not deprecated:
        return None
    body = "\n".join(line[_NOTE_INDENT:] for line in deprecated.split("\n"))
    return "**Deprecated:**\n%s\n" % body


def _write_function_entry(mdfile, info):
    """Write a single function entry (anchor, heading and all sections) to `mdfile`."""
    usage = _make_pretty_usage(info.usage)
    since = info.since

    # Heading slugs generated by the markdown toc extension are useless for
    # operators such as `<` or `||`, so every entry gets an explicit anchor.
    mdfile.write('<a name="%s"></a>\n\n' % _make_anchor(info.name))
    mdfile.write("### %s\n\n" % info.name)
    if usage is not None:
        mdfile.write("%s\n\n" % usage.strip())
    optional_sections = (
        _make_pretty_arguments(info.arguments),
        _make_pretty_examples(info.examples),
        _make_pretty_note(info.note),
    )
    for section in optional_sections:
        if section is not None:
            mdfile.write(section)
    if since:
        mdfile.write("**Since:** %s\n\n" % since.strip())
    deprecated = _make_pretty_deprecated(info.deprecated)
    if deprecated is not None:
        mdfile.write(deprecated)
    mdfile.write("<br/>\n\n")


def _write_index(docs_dir, categories_with_functions, infos_by_group):
    """
    Write index.md: an overview page listing every category together with
    links to each of its functions.

    Parameters:
        docs_dir (str): The docs directory path.
        categories_with_functions (list): List of tuples
            (group_key, display_name, file_name, count).
        infos_by_group (dict): Maps a group key to its list of ExpressionInfo.
    """
    index_path = os.path.join(docs_dir, "index.md")
    with open(index_path, 'w', encoding="utf-8") as mdfile:
        mdfile.write("# Built-in Functions\n\n")
        mdfile.write(_INDEX_CSS)
        mdfile.write("Spark SQL provides a comprehensive set of built-in functions for data ")
        mdfile.write("manipulation and analysis. Functions are organized into the following ")
        mdfile.write("categories:\n\n")

        # Sort categories by display name for consistent ordering
        for group_key, display_name, file_name, count in sorted(
                categories_with_functions, key=lambda c: c[1]):
            mdfile.write("## %s (%d)\n\n" % (display_name, count))

            # The links sit inside a raw HTML block, which markdown does not
            # process, so names such as `<` or `&` must be escaped by hand.
            mdfile.write('<div class="func-grid">\n')
            for info in infos_by_group.get(group_key, []):
                mdfile.write('<a href="%s/#%s">%s</a>\n' % (
                    file_name, _make_anchor(info.name), html.escape(info.name)))
            mdfile.write('</div>\n\n')


def generate_sql_api_markdown(jvm, docs_dir):
    """
    Generates markdown files after listing the function information.
    Creates one file per category plus an index file.
    Also generates mkdocs.yml with auto-generated navigation.

    Parameters:
        jvm: JVM view used to reach `PythonSQLUtils.listBuiltinFunctionInfos()`.
        docs_dir (str): Existing directory that receives the markdown files;
            mkdocs.yml is written to its parent directory.

    Expected output for each category file:

    # Category Name

    <a name="ANCHOR"></a>

    ### NAME

    USAGE

    **Arguments:**

    ARGUMENTS

    **Examples:**

    ```
    EXAMPLES
    ```

    **Note:**

    NOTE

    **Since:** SINCE

    **Deprecated:**

    DEPRECATED

    <br/>

    """
    grouped_infos = _list_grouped_function_infos(jvm)

    # (group_key, display_name, file_name, count) for every generated page;
    # consumed by both the index page and the mkdocs navigation.
    categories_with_functions = []

    # Generate a separate markdown file for each category
    for group_key, infos in grouped_infos:
        display_name = _get_display_name(group_key)
        file_name = _get_file_name(group_key)
        categories_with_functions.append((group_key, display_name, file_name, len(infos)))

        category_path = os.path.join(docs_dir, "%s.md" % file_name)
        with open(category_path, 'w', encoding="utf-8") as mdfile:
            mdfile.write("# %s\n\n" % display_name)
            mdfile.write("This page lists all %s available in Spark SQL.\n\n"
                         % display_name.lower())
            mdfile.write("---\n\n")
            for info in infos:
                _write_function_entry(mdfile, info)

    _write_index(docs_dir, categories_with_functions, dict(grouped_infos))

    # Auto-generate mkdocs.yml with navigation
    _generate_mkdocs_yml(docs_dir, categories_with_functions)


def _generate_mkdocs_yml(docs_dir, categories_with_functions):
    """
    Generate mkdocs.yml with auto-generated navigation based on function categories.

    Parameters:
        docs_dir (str): The docs directory path.
        categories_with_functions (list): List of tuples
            (group_key, display_name, file_name, count).
    """
    # mkdocs.yml lives next to the docs directory.  normpath removes a trailing
    # separator, which would otherwise make dirname() return docs_dir itself.
    mkdocs_path = os.path.join(os.path.dirname(os.path.normpath(docs_dir)), "mkdocs.yml")

    # Sort categories by display name for consistent ordering
    sorted_categories = sorted(categories_with_functions, key=lambda c: c[1])

    with open(mkdocs_path, 'w', encoding="utf-8") as f:
        f.write("# AUTO-GENERATED FILE - DO NOT EDIT MANUALLY\n")
        f.write("# This file is generated by gen-sql-api-docs.py\n")
        f.write("# Run 'sql/create-docs.sh' to regenerate\n")
        f.write("\n")
        f.write("site_name: Spark SQL, Built-in Functions\n")
        f.write("theme:\n")
        f.write("  name: readthedocs\n")
        f.write("  navigation_depth: 3\n")
        f.write("  collapse_navigation: true\n")
        f.write("nav:\n")
        f.write("  - 'Overview': 'index.md'\n")
        for _group, display_name, file_name, _count in sorted_categories:
            f.write("  - '%s': '%s.md'\n" % (display_name, file_name))
        f.write("markdown_extensions:\n")
        f.write("  - toc:\n")
        # The toc options must be nested below the `toc` key to take effect.
        f.write("      anchorlink: True\n")
        f.write("      permalink: True\n")
        f.write("  - tables\n")


if __name__ == "__main__":
    # Only the script entry point needs a running gateway, so pyspark is
    # imported here; the helpers above stay importable without Spark.
    from pyspark.java_gateway import launch_gateway

    jvm = launch_gateway().jvm
    # abspath keeps the result correct when the script is started through a
    # bare relative file name.
    spark_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    docs_dir = os.path.join(spark_root_dir, "sql/docs")
    # Create docs directory if it doesn't exist
    os.makedirs(docs_dir, exist_ok=True)
    generate_sql_api_markdown(jvm, docs_dir)