Files
Maxim Lysak 1c74a9b9c7 feat: Implementation of HTML backend with headless browser (#2969)
- Implementation of an HTML backend that (optionally) uses a headless browser (via Playwright) to materialize HTML pages into images, and adds provenance with bounding boxes to all elements in the converted docling document.
- Conversion preserves reading order given by HTML DOM tree
- Added support for HTML "input" fields: checkboxes, radio buttons, text inputs, etc.
- Added support for a key-value convention in HTML (e.g., elements with ids "key1" and "key1_value1" are paired as a key-value item; see the test cases for examples)
- Heuristic that glues independent inline HTML elements with single-character text in them into larger text blocks
- Support for inline styling (bold, italic, etc.)

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2026-03-24 14:28:57 +01:00

454 lines
16 KiB
YAML

# Trigger: this workflow only runs when another workflow calls it.
on:
  workflow_call:
    inputs:
      # Gates the "Upload coverage to Codecov" steps in the test jobs.
      push_coverage:
        description: "If true, the coverage results are pushed to codecov.io."
        type: boolean
        default: true
    secrets:
      # Optional; passed as the token to the Codecov upload action.
      CODECOV_TOKEN:
        required: false
# Environment variables visible to every job and step in this workflow.
env:
  # Quoted so the values stay strings rather than being read as integers.
  HF_HUB_DOWNLOAD_TIMEOUT: '90'
  HF_HUB_ETAG_TIMEOUT: '90'
  UV_FROZEN: '1'
  # One test file per line. run-tests-1 turns each line into an --ignore=
  # option; run-tests-2 passes the same lines to pytest as the files to run.
  PYTEST_ML: |-
    tests/test_e2e_conversion.py
    tests/test_e2e_ocr_conversion.py
    tests/test_backend_webp.py
    tests/test_asr_pipeline.py
    tests/test_threaded_pipeline.py
  # Tests that run-tests-2 hands to `pytest --deselect`. Currently empty, in
  # which case run-tests-2 adds no --deselect option at all.
  PYTEST_TO_SKIP: ""
  # Regex tested against each example's basename in run-examples; examples
  # that match are skipped.
  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|suryaocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping|mlx_whisper_example|gpu_standard_pipeline|gpu_vlm_pipeline|demo_layout_vlm|post_process_ocr_with_vlm|run_with_formats_html_rendered|run_with_formats_html_rendered_mp)\.py$|xbrl_conversion\.ipynb$'
jobs:
# Runs the pre-commit hooks over the whole repository on a single Python version.
lint:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.12"
  steps:
    - uses: actions/checkout@v6
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        enable-cache: true
        python-version: ${{ matrix.python-version }}
    # Exports PY (a hash of `python -VV` output) for use in the cache key below.
    - name: Set pre-commit cache key
      run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
    - name: Cache pre-commit environments
      uses: actions/cache@v5
      with:
        # Keyed on the interpreter hash and the pre-commit configuration file.
        key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
        restore-keys: |
          pre-commit|${{ env.PY }}|
        path: ~/.cache/pre-commit
    - name: Install Python Dependencies
      run: uv sync --frozen --all-extras
    - name: Check style
      run: |
        echo "--- Running pre-commit style checks ---"
        uv run pre-commit run --all-files
# Test group 1: the pytest suite with every file listed in PYTEST_ML ignored,
# run once per Python version in the matrix.
run-tests-1:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.10"
        - "3.11"
        - "3.12"
        - "3.13"
        - "3.14"
  steps:
    - uses: actions/checkout@v6
    - name: Grant permissions to APT cache directory  # allows restore
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
    - name: Cache APT packages
      id: apt-cache
      uses: actions/cache@v5
      with:
        # The key changes whenever the checks workflow file changes.
        key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
        restore-keys: |
          apt-packages-${{ runner.os }}-
        path: /var/cache/apt/archives
    - name: Install System Dependencies
      run: |
        sudo apt-get -qq update
        sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
    # Exports the tessdata directory shipped by the tesseract-ocr-eng package.
    - name: Set TESSDATA_PREFIX
      run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install Python Dependencies
      run: uv sync --frozen --all-extras
    - name: Cache Models
      uses: actions/cache@v5
      with:
        # Fixed key: the same cache entry is used by every job and Python version.
        key: models-cache
        path: |
          ~/.cache/huggingface
          ~/.cache/modelscope
          ~/.EasyOCR/
    - name: Pre-download Models
      run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
    - name: Run tests for GROUP1
      run: |
        echo "--- Running tests ---"
        GROUP1=$(echo "$PYTEST_ML" | sed -e 's/^/--ignore=/' | tr '\n' ' ')
        echo "Running tests for GROUP1"
        uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP1
    - name: Upload coverage to Codecov
      if: inputs.push_coverage
      uses: codecov/codecov-action@v5
      with:
        files: ./coverage.xml
        flags: run-tests-1
        token: ${{ secrets.CODECOV_TOKEN }}
    - name: Grant permissions to APT cache directory  # allows backup
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
# Test group 2: runs only the test files listed in PYTEST_ML, minus any test
# IDs listed in PYTEST_TO_SKIP, once per Python version in the matrix.
run-tests-2:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]
  steps:
    - uses: actions/checkout@v6
    - name: Grant permissions to APT cache directory  # allows restore
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
    - name: Cache APT packages
      id: apt-cache
      uses: actions/cache@v5
      with:
        path: /var/cache/apt/archives
        # The key changes whenever the checks workflow file changes.
        key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
        restore-keys: |
          apt-packages-${{ runner.os }}-
    - name: Install System Dependencies
      run: |
        sudo apt-get -qq update
        sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
    # Exports the tessdata directory shipped by the tesseract-ocr-eng package.
    - name: Set TESSDATA_PREFIX
      run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install Python Dependencies
      run: uv sync --frozen --all-extras
    - name: Cache Models
      uses: actions/cache@v5
      with:
        path: |
          ~/.cache/huggingface
          ~/.cache/modelscope
          ~/.EasyOCR/
        # Fixed key: the same cache entry is used by every job and Python version.
        key: models-cache
    - name: Pre-download Models
      run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"
    - name: Run tests for GROUP2
      run: |
        echo "--- Running tests ---"
        GROUP2=$(echo "$PYTEST_ML" | tr '\n' ' ')
        # PYTEST_TO_SKIP may list several test IDs, one per line. pytest needs a
        # separate --deselect flag for each one; a single flag in front of the
        # whole list would leave every later ID as a positional test to run.
        DESELECT_OPT=""
        if [ -n "$PYTEST_TO_SKIP" ]; then
          DESELECT_OPT=$(echo "$PYTEST_TO_SKIP" | sed -e '/^[[:space:]]*$/d' -e 's/^/--deselect /' | tr '\n' ' ')
        fi
        echo "Running tests for GROUP2"
        # GROUP2 and DESELECT_OPT are left unquoted on purpose so the shell
        # splits them into separate pytest arguments.
        uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP2 $DESELECT_OPT
    - name: Upload coverage to Codecov
      if: inputs.push_coverage
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        files: ./coverage.xml
        flags: run-tests-2
    - name: Grant permissions to APT cache directory  # allows backup
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
# Executes every docs/examples/*.py script whose basename does not match
# EXAMPLES_TO_SKIP, once per Python version, and prints a runtime summary.
run-examples:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.10"
        - "3.11"
        - "3.12"
        - "3.13"
        - "3.14"
  steps:
    - uses: actions/checkout@v6
    - name: Grant permissions to APT cache directory  # allows restore
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
    - name: Cache APT packages
      id: apt-cache
      uses: actions/cache@v5
      with:
        # The key changes whenever the checks workflow file changes.
        key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
        restore-keys: |
          apt-packages-${{ runner.os }}-
        path: /var/cache/apt/archives
    - name: Install System Dependencies
      run: |
        sudo apt-get -qq update
        sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config
    # Exports the tessdata directory shipped by the tesseract-ocr-eng package.
    - name: Set TESSDATA_PREFIX
      run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install Python Dependencies
      run: uv sync --frozen --all-extras
    - name: Cache Models
      uses: actions/cache@v5
      with:
        # Fixed key: the same cache entry is used by every job and Python version.
        key: models-cache
        path: |
          ~/.cache/huggingface
          ~/.cache/modelscope
          ~/.EasyOCR/
    # Prints disk usage before and after deleting the listed directories.
    - name: Free up disk space
      run: |
        df -h
        sudo rm -rf /usr/share/dotnet
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /opt/ghc
        sudo apt-get clean
        df -h
    # The first failing example aborts the step via `|| exit 1`.
    - name: Run examples
      run: |
        echo "--- Creating output directory ---"
        mkdir -p scratch
        echo "--- Running examples ---"
        summary_file="runtime_summary.log"
        echo "--- Example Runtimes ---" > "$summary_file"
        for file in docs/examples/*.py; do
          if [[ "$(basename "$file")" =~ ${EXAMPLES_TO_SKIP} ]]; then
            echo "Skipping example: $(basename "$file")"
          else
            echo "--- Running example $(basename "$file") ---"
            start_time=$SECONDS
            uv run --no-sync python "$file" || exit 1
            duration=$((SECONDS - start_time))
            echo "Finished in ${duration}s."
            echo "$(basename "$file"): ${duration}s" >> "$summary_file"
          fi
        done
        echo
        echo "==================================="
        echo " Final Runtime Summary "
        echo "==================================="
        cat "$summary_file"
        echo "==================================="
    - name: Grant permissions to APT cache directory  # allows backup
      run: sudo chown -R $USER:$USER /var/cache/apt/archives
# Installs the package in editable mode with `uv pip install` in a fresh
# virtual environment per Python version, then checks that docling imports.
test-pip-install-no-lock:
  runs-on: ubuntu-latest
  steps:
    # Action versions match the ones used by the other jobs in this workflow.
    - uses: actions/checkout@v6
    - name: Install uv
      uses: astral-sh/setup-uv@v7
    - name: Test pip install across Python versions
      run: |
        for py_version in 3.10 3.11 3.12 3.13 3.14; do
          echo "=========================================="
          echo "Testing Python $py_version"
          echo "=========================================="
          venv_dir="/tmp/venv-${py_version}"
          # Create virtual environment with uv
          uv venv "$venv_dir" --python="${py_version}"
          source "$venv_dir/bin/activate"
          # Install package with pip (no lock file)
          # The extras spec is quoted so the shell cannot treat [...] as a glob.
          uv pip install --torch-backend=cpu -e ".[easyocr,tesserocr,vlm,rapidocr,asr]"
          # Run basic import test
          python -c "import docling; from docling.document_converter import DocumentConverter; print('Import successful for Python ${py_version}')"
          # Cleanup
          deactivate
          rm -rf "$venv_dir"
          echo "Python $py_version: PASSED"
          echo ""
        done
# For each Python version: deletes the interpreter's C headers, confirms that
# a from-source numpy build then fails, and checks that the package (without
# the tesserocr extra) still installs and imports.
test-pip-install-no-dev-headers:
  runs-on: ubuntu-latest
  steps:
    # Action versions match the ones used by the other jobs in this workflow.
    - uses: actions/checkout@v6
    - name: Install uv
      uses: astral-sh/setup-uv@v7
    - name: Test pip install without dev headers across Python versions
      run: |
        for py_version in 3.10 3.11 3.12 3.13 3.14; do
          echo "=========================================="
          echo "Testing Python $py_version (no dev headers)"
          echo "=========================================="
          venv_dir="/tmp/venv-nodev-${py_version}"
          numpy_log="/tmp/numpy-install-${py_version}.log"
          # Create virtual environment with uv
          uv venv "$venv_dir" --python="${py_version}"
          source "$venv_dir/bin/activate"
          # Find and remove Python.h from the Python installation
          echo "Removing Python development headers from Python installation..."
          python_include_dir=$(python -c "import sysconfig; print(sysconfig.get_path('include'))")
          echo "Python include directory: $python_include_dir"
          if [ -f "$python_include_dir/Python.h" ]; then
            echo "Found Python.h, removing it and other headers..."
            # Use sudo if the directory is system-owned
            if [ -w "$python_include_dir" ]; then
              rm -rf "$python_include_dir"/*
            else
              sudo rm -rf "$python_include_dir"/*
            fi
            echo "✓ Headers removed"
          else
            echo "Warning: Python.h not found at expected location"
          fi
          # Verify that compilation fails without dev headers
          # Try to install numpy from source (sdist) - this should fail
          echo "Verifying compilation fails without dev headers..."
          set +e  # Temporarily allow command to fail
          uv pip install --no-binary=:all: numpy==1.26.4 > "$numpy_log" 2>&1
          numpy_exit_code=$?
          set -e  # Re-enable exit on error
          if [ $numpy_exit_code -eq 0 ]; then
            echo "ERROR: numpy installation from source succeeded, but it should have failed without dev headers!"
            cat "$numpy_log"
            exit 1
          else
            echo "✓ Compilation correctly failed without dev headers (expected behavior)"
            # Check that the error mentions missing Python.h or similar
            if grep -qi "Python.h\|fatal error.*\.h" "$numpy_log"; then
              echo "✓ Error message confirms missing development headers"
            else
              echo "Warning: Error message doesn't explicitly mention missing headers, but compilation failed as expected"
            fi
          fi
          # Install package with pip (no lock file, no compilation)
          # Install without extras that require compilation (tesserocr requires dev headers)
          # Note: Not using --only-binary since some packages are sdist-only but don't require compilation
          # The extras spec is quoted so the shell cannot treat [...] as a glob.
          uv pip install --torch-backend=cpu -e ".[easyocr,vlm,rapidocr,asr]"
          # Run basic import test
          python -c "import docling; from docling.document_converter import DocumentConverter; print('Import successful for Python ${py_version} without dev headers')"
          # Cleanup
          deactivate
          rm -rf "$venv_dir"
          echo "Python $py_version (no dev headers): PASSED"
          echo ""
        done
# Builds the distribution with `uv build` and uploads dist/ as an artifact.
build-package:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.12"
  steps:
    - uses: actions/checkout@v6
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        enable-cache: true
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: uv sync --all-extras
    - name: Build package
      run: uv build
    # Lists the wheel's file contents in the job log.
    - name: Check content of wheel
      run: unzip -l dist/*.whl
    - name: Store the distribution packages
      uses: actions/upload-artifact@v6
      with:
        # test-package downloads the artifact under this same name.
        name: python-package-distributions
        path: dist/
# Installs the wheel produced by build-package and runs `docling --help`.
test-package:
  needs: [build-package]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.12"
  steps:
    # No checkout step: this job works from the downloaded artifact only.
    - name: Download all the dists
      uses: actions/download-artifact@v7
      with:
        name: python-package-distributions
        path: dist/
    - name: Install uv and set the python version
      uses: astral-sh/setup-uv@v7
      with:
        python-version: ${{ matrix.python-version }}
        enable-cache: false
        activate-environment: true
    - name: Install package
      run: |
        uv pip install dist/*.whl
    - name: Run docling
      run: uv run docling --help