Skip to content

Auto-Recognition

Let Pauhu figure it out. Automatic language detection across 176 languages with 99.5% overall accuracy (up to 99.9% for EU official languages). Script identification, dialect recognition, and mixed-language handling — all automatic.


Detection Accuracy

| Category | Accuracy | Languages |
|---|---|---|
| EU Official Languages | 99.9% | 24 |
| Major World Languages | 99.7% | 50 |
| All Supported Languages | 99.5% | 176 |
| Short Text (<20 chars) | 95.0% | 176 |
| Mixed Languages | 97.0% | 176 |

Quick Detection

from pauhu import Pauhu

client = Pauhu()

# Detect language
result = client.detect("Hyvää päivää!")

print(result.language)      # "fi"
print(result.language_name) # "Finnish"
print(result.confidence)    # 0.997
print(result.script)        # "Latin"

Confidence Scoring

# Get confidence for all detected languages
result = client.detect(
    text="Hello world",
    return_all=True
)

for lang in result.languages:
    print(f"{lang.code}: {lang.confidence:.3f} ({lang.name})")

# Output:
# en: 0.987 (English)
# nl: 0.008 (Dutch)
# de: 0.003 (German)
# fr: 0.001 (French)

Script Detection

# Detect writing script
result = client.detect("Привет мир")

print(result.script)        # "Cyrillic"
print(result.language)      # "ru"
print(result.script_confidence)  # 0.999

Supported Scripts

| Script | Languages | Example |
|---|---|---|
| Latin | English, Finnish, German... | Hello |
| Cyrillic | Russian, Ukrainian, Bulgarian | Привет |
| Arabic | Arabic, Persian, Urdu | مرحبا |
| Hebrew | Hebrew, Yiddish | שלום |
| Greek | Greek | Γειά |
| Devanagari | Hindi, Sanskrit, Marathi | नमस्ते |
| Chinese | Mandarin, Cantonese | 你好 |
| Japanese | Japanese | こんにちは |
| Korean | Korean | 안녕하세요 |
| Thai | Thai | สวัสดี |

Dialect Recognition

# Detect dialects and variants
result = client.detect(
    text="I'm going to the shopping centre",
    detect_dialect=True
)

print(result.language)  # "en"
print(result.dialect)   # "en-GB"
print(result.variant)   # "British English"

Regional Variants

| Language | Variants Detected |
|---|---|
| English | US, GB, AU, CA, IE, NZ, ZA, IN |
| Spanish | ES, MX, AR, CO, CL |
| Portuguese | PT, BR |
| Chinese | Simplified, Traditional |
| Arabic | MSA, Egyptian, Gulf, Levantine |
| German | DE, AT, CH |
| French | FR, CA, BE, CH |

Mixed Language Detection

# Detect code-switching / mixed languages
result = client.detect(
    text="That was todella hyvä meeting yesterday",
    detect_mixed=True
)

print(result.is_mixed)  # True
print(result.languages) # ["en", "fi"]
print(result.segments)
# [
#   {"text": "That was", "language": "en"},
#   {"text": "todella hyvä", "language": "fi"},
#   {"text": "meeting yesterday", "language": "en"}
# ]

Batch Detection

# Detect multiple texts at once
texts = [
    "Hello, how are you?",
    "Bonjour, comment allez-vous?",
    "Hallo, wie geht es Ihnen?",
    "Hei, mitä kuuluu?"
]

results = client.detect_batch(texts)

for text, result in zip(texts, results):
    print(f"{result.language}: {text}")

# Output:
# en: Hello, how are you?
# fr: Bonjour, comment allez-vous?
# de: Hallo, wie geht es Ihnen?
# fi: Hei, mitä kuuluu?

Auto-Translate

# Detect and translate in one call
result = client.translate(
    text="Tervetuloa Suomeen!",
    source="auto",  # Auto-detect source language
    target="en"
)

print(result.detected_language)  # "fi"
print(result.translation)        # "Welcome to Finland!"

Document Language Detection

# Detect language of entire document
result = client.detect_document(
    file_path="mystery_document.pdf"
)

print(result.primary_language)  # "de"
print(result.languages)
# [
#   {"language": "de", "percentage": 85.0},
#   {"language": "en", "percentage": 15.0}
# ]
print(result.page_languages)
# [
#   {"page": 1, "language": "de"},
#   {"page": 2, "language": "de"},
#   {"page": 3, "language": "en"}  # English appendix
# ]

Edge Cases

Very Short Text

# Short text detection
result = client.detect("OK")

print(result.language)    # "en"
print(result.confidence)  # 0.65 (lower confidence)
print(result.is_uncertain)  # True

Ambiguous Text

# Numbers and symbols only
result = client.detect("123-456-7890")

print(result.is_uncertain)  # True
print(result.reason)        # "numeric_only"

Names and Proper Nouns

# Proper nouns may be ambiguous
result = client.detect(
    text="Nokia Oyj",
    context_hint="company_name"
)

print(result.language)  # "fi"

API Endpoint

# REST API detection
curl -X POST https://api.pauhu.ai/v1/detect \
  -H "Authorization: Bearer pk_..." \
  -H "Content-Type: application/json" \
  -d '{"text": "Bonjour le monde"}'

# Response
{
  "language": "fr",
  "language_name": "French",
  "confidence": 0.998,
  "script": "Latin",
  "processing_time_ms": 5
}

Performance

| Text Length | Latency | Accuracy |
|---|---|---|
| 1-9 chars | <5ms | 90% |
| 10-49 chars | <5ms | 97% |
| 50-199 chars | <10ms | 99% |
| 200+ chars | <15ms | 99.7% |

Getting Started

from pauhu import Pauhu

client = Pauhu()

# Simple detection
result = client.detect("Hyvää huomenta!")

print(f"Language: {result.language}")
print(f"Confidence: {result.confidence:.1%}")

# Output:
# Language: fi
# Confidence: 99.8%