Compute
Process images received via SMS by running OCR in a remote function.
A webhook-driven pipeline that receives inbound text messages (via Twilio), downloads the attached image, runs OCR to extract text, and replies with the result. The heavy lifting — downloading, OCR, and replying — happens in a remote function that scales independently from the webhook receiver.
This is a common pattern for SMS-based workflows: receipt scanning, document intake, ID verification, or any case where users send photos and expect a text reply.
The worker function receives a message payload, downloads the image from the MMS URL, runs OCR with Tesseract, and sends the extracted text back via Twilio.
Tesseract needs system packages, so we bake them into the image:
# ocr_worker.py
import json
import os
import chalkcompute
from chalkcompute import Container, Image, Secret, Volume
ocr_image = (
Image.base("python:3.12-slim")
.run_commands(
"apt-get update -qq && apt-get install -y -qq tesseract-ocr libtesseract-dev",
)
.pip_install(["pytesseract", "Pillow", "httpx", "twilio"])
)
ocr_container = Container(
image=ocr_image,
cpu="2",
memory="2Gi",
min_instances=0,
max_instances=8,
max_concurrent_requests=4,
secrets=[
Secret(name="TWILIO_ACCOUNT_SID"),
Secret(name="TWILIO_AUTH_TOKEN"),
Secret(name="TWILIO_PHONE_NUMBER"),
],
volumes={"ocr-archive": "/archive"},
)The volume at /archive stores every processed image and its extracted text — useful
for auditing or reprocessing later.
@chalkcompute.function(name="ocr-image", container=ocr_container)
def ocr_image(message_json: str) -> str:
"""Download an MMS image, run OCR, archive the result, and reply via SMS."""
import io
import httpx
import pytesseract
from PIL import Image as PILImage
from twilio.rest import Client
from datetime import datetime, timezone
message = json.loads(message_json)
image_url = message["image_url"]
from_number = message["from_number"]
message_sid = message["message_sid"]
# 1. Download the image from Twilio's media URL.
# Twilio media URLs require basic auth with your account credentials.
resp = httpx.get(
image_url,
auth=(os.environ["TWILIO_ACCOUNT_SID"], os.environ["TWILIO_AUTH_TOKEN"]),
follow_redirects=True,
timeout=30,
)
resp.raise_for_status()
img = PILImage.open(io.BytesIO(resp.content))
# 2. Run OCR.
extracted_text = pytesseract.image_to_string(img).strip()
if not extracted_text:
extracted_text = "(no text detected in image)"
# 3. Archive the image and result to the volume.
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
archive_dir = f"/archive/{from_number}/{timestamp}_{message_sid}"
os.makedirs(archive_dir, exist_ok=True)
img.save(os.path.join(archive_dir, "image.png"))
with open(os.path.join(archive_dir, "text.txt"), "w") as f:
f.write(extracted_text)
# 4. Reply via SMS with the extracted text.
twilio = Client(
os.environ["TWILIO_ACCOUNT_SID"],
os.environ["TWILIO_AUTH_TOKEN"],
)
# Truncate to SMS-friendly length.
reply_body = extracted_text[:1500]
if len(extracted_text) > 1500:
reply_body += "\n\n(truncated)"
twilio.messages.create(
body=f"Extracted text:\n\n{reply_body}",
from_=os.environ["TWILIO_PHONE_NUMBER"],
to=from_number,
)
return f"{message_sid}:ok:{len(extracted_text)} chars"Deploy the worker:
chalk compute deploy ocr_worker.py
# Deployed function "ocr-image" on container ...

Twilio sends an HTTP POST to your webhook URL whenever a text message arrives. The receiver parses the Twilio payload, checks for an image attachment, and dispatches the OCR function.
# webhook.py
import json
import logging
from fastapi import FastAPI, Form
from fastapi.responses import Response
import chalkcompute
# Configure logging once at import time so the webhook emits INFO-level logs.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
app = FastAPI()
# Handle to the remote "ocr-image" function deployed from ocr_worker.py.
ocr_fn = chalkcompute.function_ref("ocr-image")
@app.on_event("startup")
def on_startup() -> None:
    # Block startup until the remote OCR function is reachable, so the first
    # inbound webhook doesn't race a cold worker.
    # NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
    # favor of lifespan handlers — confirm the targeted FastAPI version.
    ocr_fn.wait_ready()
    log.info("OCR worker ready")
@app.post("/sms")
async def handle_sms(
    From: str = Form(...),
    Body: str = Form(""),
    NumMedia: str = Form("0"),
    MessageSid: str = Form(""),
    MediaUrl0: str = Form(None),
    MediaContentType0: str = Form(None),
) -> Response:
    """Handle an inbound Twilio SMS/MMS webhook.

    Returns TwiML immediately; the OCR and the SMS reply happen asynchronously
    in the "ocr-image" worker, so Twilio never sees a slow response.
    """

    def _twiml(message: str = "") -> Response:
        # Minimal TwiML document; an empty message yields empty <Response>.
        inner = f"<Message>{message}</Message>" if message else ""
        return Response(
            content=(
                '<?xml version="1.0" encoding="UTF-8"?>'
                f"<Response>{inner}</Response>"
            ),
            media_type="application/xml",
        )

    # Defensive parse: Twilio sends NumMedia as a decimal string, but a
    # malformed request shouldn't turn into a 500 from int().
    try:
        num_media = int(NumMedia)
    except ValueError:
        num_media = 0

    if num_media == 0 or MediaUrl0 is None:
        # No image attached — return a TwiML response.
        return _twiml("Send me a photo and I'll extract the text from it.")

    # MediaContentType0 defaults to None and can be absent even when media is
    # present; calling .startswith() on None would raise AttributeError.
    if not (MediaContentType0 or "").startswith("image/"):
        return _twiml("I can only process images, not other file types.")

    # Dispatch to the OCR worker — don't block the webhook response.
    payload = json.dumps({
        "image_url": MediaUrl0,
        "from_number": From,
        "message_sid": MessageSid,
        "body": Body,
    })
    ocr_fn.async_call(payload)
    log.info("Dispatched OCR for message %s from %s", MessageSid, From)

    # Return empty TwiML — the worker will send the reply directly.
    return _twiml()
@app.get("/health")
def health() -> dict:
return {"status": "ok"}The webhook returns immediately with empty TwiML. The OCR worker handles the reply asynchronously via the Twilio API, so users don’t see a timeout even if OCR takes several seconds.
# deploy_webhook.py
from chalkcompute import Container, Image, Secret
# Lightweight receiver image: FastAPI + uvicorn, plus python-multipart so
# FastAPI can parse Twilio's form-encoded webhook POSTs.
image = (
    Image.base("python:3.12-slim")
    .pip_install(["fastapi", "uvicorn", "chalkcompute", "python-multipart"])
    .add_local_file("webhook.py", "/app/webhook.py")
)

container = Container(
    image=image,
    name="sms-ocr-webhook",
    port=8000,
    # The receiver only parses forms and dispatches — it needs far fewer
    # resources than the OCR workers.
    cpu="0.5",
    memory="512Mi",
    secrets=[
        # NOTE(review): webhook.py as shown never reads this token —
        # presumably intended for Twilio request-signature validation; confirm.
        Secret(name="TWILIO_AUTH_TOKEN"),
    ],
    entrypoint=[
        "uvicorn", "webhook:app",
        "--host", "0.0.0.0",
        "--port", "8000",
        "--app-dir", "/app",
    ],
).run()

print(f"Webhook URL: {container.info.web_url}/sms")
print("Set this as your Twilio phone number's webhook URL.")

chalk compute deploy deploy_webhook.py
# Deployed container ...
# Webhook URL: https://xxxx.compute.chalk.ai/sms

Copy the /sms URL into your Twilio phone number’s messaging webhook configuration.
User's phone Twilio Webhook Container OCR Workers
│ │ │ │
│ MMS (photo) │ │ │
│──────────────────▸│ │ │
│ │ POST /sms │ │
│ │ (From, MediaUrl0) │ │
│ │──────────────────────▸│ │
│ │ │ │
│ │ empty TwiML │ │
│ │◂──────────────────────│ │
│ │ │ │
│ │ │ async_call(msg) │
│ │ │───────────────────▸│
│ │ │ │
│ │ │ │── download image
│ │ │ │── tesseract OCR
│ │ │ │── archive to Volume
│ │ │ │
│ │ twilio.messages.create(reply) │
│ │◂───────────────────────────────────────────│
│ │ │ │
│ SMS reply with │ │ │
│ extracted text │ │ │
│◂──────────────────│ │ │
│ │ │ │
Every processed image is saved to the volume, organized by phone number and timestamp. You can browse or reprocess them from any script:
from chalkcompute import Volume

# Attach to the same volume the OCR workers mount at /archive.
vol = Volume(name="ocr-archive")

# List recent OCR results for a number (entries are keyed by sender, then
# by "<timestamp>_<message_sid>" — see the worker's archive layout).
for entry in vol.listdir("+15551234567/"):
    print(entry.path)

# Read extracted text from a specific run. read_file returns bytes; the
# worker wrote plain text, so decode before printing.
text = vol.read_file("+15551234567/20260410_143022_SM1234abc/text.txt")
print(text.decode())