Skip to content

Multi-Modal Translation

Beyond text. Pauhu processes images, audio, and video with the same precision as text. Extract text from images, transcribe speech, synthesize translated audio, and subtitle videos.


Modality Matrix

| Input | Output    | Use Case                    |
|-------|-----------|-----------------------------|
| Text  | Text      | Standard translation        |
| Text  | Audio     | Text-to-speech              |
| Image | Text      | OCR + translation           |
| Image | Image     | Overlay translation         |
| Audio | Text      | Transcription + translation |
| Audio | Audio     | Speech translation          |
| Video | Subtitles | Video translation           |
| Video | Audio     | Dubbing                     |

Image Translation (OCR)

Extract and Translate

from pauhu import Pauhu

client = Pauhu()

# Translate the text found in an image into Finnish.
ocr_request = {"image_path": "sign.jpg", "target": "fi"}
result = client.translate_image(**ocr_request)

# Print each field of the result in turn; sample values for this image:
#   extracted_text -> "Exit"
#   translation    -> "Uloskäynti"
#   confidence     -> 0.98
for field in ("extracted_text", "translation", "confidence"):
    print(getattr(result, field))

Overlay Translation

# Build a translated copy of the image: output="overlay" replaces the
# text in the image.
overlay_request = dict(
    image_path="infographic.png",
    target="fi",
    output="overlay",
)
result = client.translate_image(**overlay_request)

# Write the translated image to disk.
result.save("infographic_fi.png")

Batch Image Processing

# Translate a whole folder of images in a single call.
image_formats = ["png", "jpg", "tiff"]
results = client.translate_images(
    folder_path="/images/english",
    output_folder="/images/finnish",
    target="fi",
    formats=image_formats,
)

Audio Translation

Speech-to-Speech

from pauhu import Pauhu

client = Pauhu()

# Speech-to-speech: translate an English audio file and get Finnish audio.
speech_request = {
    "audio_path": "meeting.mp3",
    "source": "en",
    "target": "fi",
    "output": "audio",  # generate Finnish audio
}
result = client.translate_audio(**speech_request)
result.save("meeting_fi.mp3")

# Intermediate results are available on the same result object:
#   transcript  -> original English text
#   translation -> Finnish text
#   duration_ms -> audio duration
for field in ("transcript", "translation", "duration_ms"):
    print(getattr(result, field))

Real-Time Speech Translation

from pauhu import Pauhu
from pauhu.realtime import AudioStream

client = Pauhu()


async def translate_live():
    """Translate live microphone speech from English to Finnish.

    Opens an ``AudioStream`` on the microphone and iterates over the
    segments yielded by ``transcribe_and_translate``. Each segment is
    played back, and its source and target text are printed.

    This is a coroutine function: calling it only creates the coroutine.
    Run it with ``asyncio.run(translate_live())`` or ``await`` it from
    other async code.
    """
    audio = AudioStream(device="microphone")

    async for segment in audio.transcribe_and_translate(
        source="en",
        target="fi",
        output="audio",
    ):
        # Play translated audio in real-time
        segment.play()

        # The text of each segment is available as well
        print(f"EN: {segment.source_text}")
        print(f"FI: {segment.target_text}")

Text-to-Speech

# Generate Finnish speech from English text.
# `voice` takes a Voice ID as listed in the "Finnish Voices" table
# (fi_female_1, fi_male_1, ...).
result = client.text_to_speech(
    text="Welcome to Finland",
    source="en",
    target="fi",
    voice="fi_female_1",  # Finnish voice
)

result.save("welcome_fi.mp3")

Video Translation

Generate Subtitles

from pauhu import Pauhu

client = Pauhu()

# Generate Finnish subtitles for the video.
subtitle_request = dict(
    video_path="presentation.mp4",
    target="fi",
    output="subtitles",
)
result = client.translate_video(**subtitle_request)

# Save the subtitles as an SRT file.
srt_path = "presentation_fi.srt"
result.save_subtitles(srt_path)

# Embed the subtitles, writing the result to a new video file.
subtitled_video_path = "presentation_subtitled.mp4"
result.embed_subtitles(output_path=subtitled_video_path)

Dubbing

# Full audio dubbing of a video into Finnish.
dubbing_options = {
    "preserve_music": True,  # keep background music
    "voice_cloning": True,   # match original speaker
}
result = client.translate_video(
    video_path="training.mp4",
    target="fi",
    output="dubbed",
    **dubbing_options,
)
result.save("training_fi.mp4")

Multi-Language Subtitles

# Produce one SRT subtitle file per target language.
targets = ["fi", "sv", "de", "fr"]

for lang in targets:
    srt_name = f"product_demo_{lang}.srt"
    result = client.translate_video(
        video_path="product_demo.mp4", target=lang, output="subtitles"
    )
    result.save_subtitles(srt_name)

Voice Options

Finnish Voices

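| Voice ID    | Gender | Style        | Quality |
|-------------|--------|--------------|---------|
| fi_female_1 | Female | Professional | Neural  |
| fi_female_2 | Female | Casual       | Neural  |
| fi_male_1   | Male   | Professional | Neural  |
| fi_male_2   | Male   | Casual       | Neural  |
| fi_child_1  | Child  | Neutral      | Neural  |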

Voice Selection

# Select a specific voice by its ID; 1.0 is normal speed and normal pitch.
voice_settings = {"voice": "fi_female_1", "speed": 1.0, "pitch": 1.0}
result = client.translate_audio(
    audio_path="speech.mp3",
    target="fi",
    **voice_settings,
)

Voice Cloning

# Clone the speaker's voice in the target language.
cloning_request = dict(
    audio_path="ceo_speech.mp3",
    target="fi",
    voice_cloning=True,
    reference_audio="ceo_sample.mp3",  # 30+ seconds
)
result = client.translate_audio(**cloning_request)

Quality Settings

from pauhu import Pauhu

client = Pauhu()

# High-quality profile for production.
production_quality = {
    "transcription": "whisper-large",
    "translation": "quality",
    "tts": "neural-hd",
    "sample_rate": 48000,
    "bitrate": 320,
}

# Fast profile for real-time use.
realtime_quality = {
    "transcription": "whisper-small",
    "translation": "fast",
    "tts": "neural",
    "sample_rate": 16000,
    "bitrate": 128,
}

# Production run with the high-quality profile.
result = client.translate_audio(
    audio_path="audiobook.mp3", target="fi", quality=production_quality
)

# Real-time run with the fast profile.
result = client.translate_audio(
    audio_path="call.mp3", target="fi", quality=realtime_quality
)

Supported Formats

Images

Format OCR Overlay Notes
PNG Transparency preserved
JPEG Quality maintained
TIFF Multi-page support
PDF Vector + raster
WebP Modern web format

Audio

Format Input Output Notes
MP3 Universal
WAV Lossless
FLAC Lossless compressed
OGG Open format
M4A Apple format

Video

Format Subtitles Dubbing Notes
MP4 H.264/H.265
MKV Multi-track
WebM Web-optimized
MOV Apple format
AVI Legacy

Getting Started

from pauhu import Pauhu

client = Pauhu()

# Image: translate the text in an image and print the translation.
image_request = {"image_path": "sign.jpg", "target": "fi"}
image_result = client.translate_image(**image_request)
print(image_result.translation)

# Audio: translate an audio file and save the result.
audio_request = {"audio_path": "speech.mp3", "target": "fi"}
audio_result = client.translate_audio(**audio_request)
audio_result.save("speech_fi.mp3")

# Video: generate translated subtitles and save them as an SRT file.
subtitle_options = {"target": "fi", "output": "subtitles"}
video_result = client.translate_video(
    video_path="video.mp4", **subtitle_options
)
video_result.save_subtitles("video_fi.srt")