Skip to content

Document Intelligence

Documents in, documents out. Pauhu analyzes document structure, preserves formatting, and reconstructs the translated document with pixel-perfect layout fidelity. Tables, headers, footnotes — everything stays in place.


Supported Formats

| Category | Formats | Preservation |
|---|---|---|
| Office | DOCX, XLSX, PPTX, ODT, ODS | Full formatting |
| PDF | PDF, PDF/A, PDF/UA | Layout + accessibility |
| Text | TXT, RTF, HTML, XML | Structure |
| Publishing | InDesign, QuarkXPress | Professional layout |
| CAD | AutoCAD, SolidWorks | Technical drawings |
| Legal | Contracts, filings | Clause structure |
| Medical | HL7, FHIR, CDA | Clinical standards |
| Subtitles | SRT, VTT, ASS | Timing preserved |

How It Works

graph LR
    A[Input Document] --> B[Structure Analysis]
    B --> C[Text Extraction]
    C --> D[Translation Engine]
    D --> E[Reconstruction]
    E --> F[Output Document]

    B --> G[Layout Map]
    G --> E

    C --> H[Style Extraction]
    H --> E

Python SDK

Translate a Document

from pauhu import Pauhu

client = Pauhu()

# Translate PDF document
result = client.translate_document(
    file_path="contract.pdf",
    target="fi",
    preserve_layout=True
)

# Save translated document
result.save("contract_fi.pdf")

# Check what was preserved
print(result.metadata.preserved)
# ['tables', 'headers', 'footers', 'images', 'fonts']

Stream Large Documents

# For documents >100 pages, use streaming
async for page in client.translate_document_stream(
    file_path="annual_report.pdf",
    target="fi"
):
    print(f"Page {page.number}: {page.word_count} words")
    page.save(f"annual_report_fi_page_{page.number}.pdf")

Layout Preservation

Tables

# Tables maintain cell structure and formatting
result = client.translate_document(
    file_path="financial_report.xlsx",
    target="fi",
    preserve={
        "cell_formatting": True,
        "formulas": True,
        "charts": True,
        "merged_cells": True
    }
)

Headers & Footers

# Headers/footers translated separately
result = client.translate_document(
    file_path="legal_brief.docx",
    target="fi",
    translate_headers=True,
    translate_footers=True,
    preserve_page_numbers=True
)

Images with Text

# OCR and translate text in images
result = client.translate_document(
    file_path="brochure.pdf",
    target="fi",
    ocr_images=True,  # Extract text from images
    recreate_images=True  # Overlay translated text
)

Document Types

Legal Contracts

# Specialized handling for contracts
result = client.translate_document(
    file_path="contract.pdf",
    target="fi",
    domain="12 Law",
    document_type="contract",
    preserve={
        "clause_numbers": True,
        "definitions": True,
        "cross_references": True
    }
)

Financial Reports

# Handle financial tables and figures
result = client.translate_document(
    file_path="q4_report.xlsx",
    target="fi",
    domain="24 Finance",
    preserve={
        "number_formats": True,  # EUR 1,234.56 -> 1 234,56 EUR
        "formulas": True,
        "conditional_formatting": True
    }
)

Technical Manuals

# Preserve technical diagrams and references
result = client.translate_document(
    file_path="user_manual.docx",
    target="fi",
    domain="64 Production",
    preserve={
        "diagrams": True,
        "callouts": True,
        "index": True,
        "cross_references": True
    }
)

Batch Processing

import os
from pauhu import Pauhu

client = Pauhu()

# Process entire folder
input_folder = "/docs/english"
output_folder = "/docs/finnish"

results = client.translate_documents(
    input_folder=input_folder,
    output_folder=output_folder,
    target="fi",
    recursive=True,
    parallel=True  # Use all CPU cores
)

for result in results:
    print(f"{result.filename}: {result.status}")

Quality Assurance

Automated Checks

result = client.translate_document(
    file_path="document.pdf",
    target="fi",
    quality_checks=[
        "completeness",      # All text translated
        "formatting",        # Layout preserved
        "terminology",       # Consistent terms
        "numbers",           # Numbers unchanged
        "placeholders"       # Tags preserved
    ]
)

# Review issues
for issue in result.quality_issues:
    print(f"Page {issue.page}: {issue.type} - {issue.message}")

Side-by-Side Comparison

# Generate comparison document
result.generate_comparison(
    output_path="comparison.pdf",
    highlight_changes=True
)

API Endpoints

REST API

# Upload and translate
curl -X POST https://api.pauhu.ai/v1/documents \
  -H "Authorization: Bearer pk_..." \
  -F "file=@contract.pdf" \
  -F "target=fi" \
  -F "preserve_layout=true"

# Check status
curl https://api.pauhu.ai/v1/documents/{job_id}/status \
  -H "Authorization: Bearer pk_..."

# Download result
curl https://api.pauhu.ai/v1/documents/{job_id}/download \
  -H "Authorization: Bearer pk_..." \
  -o contract_fi.pdf

Performance

| Document Type | Pages | Time | Accuracy |
|---|---|---|---|
| Contract (PDF) | 20 | 15s | 99.8% |
| Annual Report (PDF) | 100 | 45s | 99.5% |
| User Manual (DOCX) | 50 | 25s | 99.7% |
| Spreadsheet (XLSX) | 10 sheets | 10s | 99.9% |

Getting Started

from pauhu import Pauhu

client = Pauhu()

# Translate your first document
result = client.translate_document(
    file_path="document.pdf",
    target="fi"
)

result.save("document_fi.pdf")
print(f"Translated {result.page_count} pages")