LangExtract

LangExtract is Google's open-source Python library for extracting structured information from unstructured text using Large Language Models (LLMs). It provides precise source grounding, interactive visualizations, and supports multiple model providers.

Installation

# Basic installation
pip install langextract

# For development (from source)
git clone https://github.com/google/langextract.git
cd langextract
pip install -e .

Quick Start

import langextract as lx

# Basic extraction
# The example's output keys mirror the fields named in the prompt
# (names, dates, locations) so the few-shot example and the task agree.
result = lx.extract(
    text_or_documents="Your unstructured text here...",
    prompt_description="Extract names, dates, and locations",
    examples=[
        {
            "input": "John visited Paris on May 15th",
            "output": {
                "names": ["John"],
                "dates": ["May 15th"],
                "locations": ["Paris"],
            },
        }
    ],
    model_id="gemini-2.0-flash-exp",
)

# Access results
print(result.extractions)
print(result.visualize())  # Interactive HTML visualization

Core Components

1. Basic Extraction

import langextract as lx

# Simple extraction with few-shot examples
result = lx.extract(
    text_or_documents=input_text,
    prompt_description="Extract character names and their emotions",
    examples=[
        {
            "input": "Alice felt happy about the good news",
            "output": {
                "characters": [{"name": "Alice", "emotion": "happy"}]
            }
        }
    ],
    model_id="gemini-2.0-flash-exp"
)

# Check extraction results
for extraction in result.extractions:
    print(f"Text: {extraction.text}")
    print(f"Data: {extraction.data}")
    print(f"Source spans: {extraction.source_spans}")

2. Document Processing

# Process multiple documents
documents = [
    {"text": "Document 1 content...", "metadata": {"source": "doc1.txt"}},
    {"text": "Document 2 content...", "metadata": {"source": "doc2.txt"}},
]

result = lx.extract(
    text_or_documents=documents,
    prompt_description="Extract key findings and recommendations",
    examples=[...],
    model_id="gemini-2.0-flash-exp"
)

3. Model Configuration

# Using different models
result = lx.extract(
    text_or_documents=text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.0-flash-exp",  # Recommended for speed
    # model_id="gemini-2.0-pro",      # For complex reasoning
    # model_id="gpt-4o-mini",         # OpenAI alternative
)

# Configure model parameters
result = lx.extract(
    text_or_documents=text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.0-flash-exp",
    generation_config={
        "temperature": 0.1,
        "max_output_tokens": 8192,
        "top_p": 0.95
    }
)

Advanced Features

1. Source Grounding & Visualization

# Extract with precise source tracking
result = lx.extract(
    text_or_documents=long_text,
    prompt_description="Extract medical conditions and treatments",
    examples=[...],
    model_id="gemini-2.0-flash-exp"
)

# Generate interactive visualization
html_viz = result.visualize()

# Save visualization to file. The encoding is explicit because the HTML may
# contain non-ASCII characters (for example, from the source text) that the
# platform's default encoding cannot represent.
with open("extraction_results.html", "w", encoding="utf-8") as f:
    f.write(html_viz)

# Access source spans for each extraction
for extraction in result.extractions:
    for entity in extraction.data.get("entities", []):
        spans = extraction.source_spans.get(entity["id"], [])
        print(f"Entity: {entity['text']} found at positions: {spans}")

2. Complex Schema Extraction

# Define complex extraction schema
medical_examples = [
    {
        "input": "Patient John Smith, 45, diagnosed with hypertension. Prescribed lisinopril 10mg daily.",
        "output": {
            "patient": {
                "name": "John Smith",
                "age": 45,
                "conditions": ["hypertension"],
                "medications": [
                    {
                        "name": "lisinopril",
                        "dosage": "10mg",
                        "frequency": "daily"
                    }
                ]
            }
        }
    }
]

result = lx.extract(
    text_or_documents=medical_report,
    prompt_description="Extract patient information, conditions, and medications",
    examples=medical_examples,
    model_id="gemini-2.0-flash-exp"
)

3. Batch Processing

# Process multiple documents efficiently
large_document_set = [
    {"text": doc1_text, "metadata": {"source": "report1.pdf"}},
    {"text": doc2_text, "metadata": {"source": "report2.pdf"}},
    # ... more documents
]

# Parallel processing for large datasets
result = lx.extract(
    text_or_documents=large_document_set,
    prompt_description="Extract key metrics and insights",
    examples=examples,
    model_id="gemini-2.0-flash-exp",
    max_workers=4  # Control parallel processing
)

# Process results per document
for doc_result in result.extractions:
    source = doc_result.metadata.get("source", "unknown")
    print(f"Results from {source}: {doc_result.data}")

4. Custom Output Parsers

# Define custom parsing logic
def parse_financial_data(extraction_result):
    """Summarize the financial figures of a result, grouped by source.

    Args:
        extraction_result: Object whose ``extractions`` attribute yields items
            exposing ``metadata`` and ``data`` mappings.

    Returns:
        A dict keyed by each extraction's ``"source"`` metadata value (``None``
        when absent). Each value holds the ``"revenue"``, ``"expenses"`` and
        ``"profit"`` entries read from ``data``; a missing figure is ``None``.
        If several extractions share a source, the last one wins.
    """
    figure_names = ("revenue", "expenses", "profit")
    return {
        item.metadata.get("source"): {
            name: item.data.get(name) for name in figure_names
        }
        for item in extraction_result.extractions
    }

# Use custom parser
result = lx.extract(
    text_or_documents=financial_reports,
    prompt_description="Extract revenue, expenses, and profit figures",
    examples=financial_examples,
    model_id="gemini-2.0-flash-exp"
)

parsed_results = parse_financial_data(result)

Common Use Cases

1. Medical Report Processing

medical_examples = [
    {
        "input": "Patient presents with chest pain. ECG shows normal sinus rhythm. Blood pressure 140/90.",
        "output": {
            "symptoms": ["chest pain"],
            "tests": [
                {"name": "ECG", "result": "normal sinus rhythm"},
                {"name": "blood pressure", "result": "140/90"}
            ],
            "assessment": "hypertensive"
        }
    }
]

result = lx.extract(
    text_or_documents=medical_notes,
    prompt_description="Extract symptoms, test results, and clinical assessments",
    examples=medical_examples,
    model_id="gemini-2.0-flash-exp"
)

2. Legal Document Analysis

legal_examples = [
    {
        "input": "The agreement between ABC Corp and XYZ Inc, dated January 15, 2024, stipulates a payment of $50,000.",
        "output": {
            "parties": ["ABC Corp", "XYZ Inc"],
            "date": "January 15, 2024",
            "financial_terms": [{"amount": "$50,000", "type": "payment"}],
            "document_type": "agreement"
        }
    }
]

result = lx.extract(
    text_or_documents=legal_documents,
    prompt_description="Extract parties, dates, financial terms, and document types",
    examples=legal_examples,
    model_id="gemini-2.0-pro"  # Use Pro for complex legal reasoning
)

3. Customer Feedback Analysis

feedback_examples = [
    {
        "input": "The product quality is excellent but shipping was slow. Customer service was very helpful.",
        "output": {
            "sentiment": "mixed",
            "aspects": [
                {"category": "product_quality", "sentiment": "positive", "text": "excellent"},
                {"category": "shipping", "sentiment": "negative", "text": "slow"},
                {"category": "customer_service", "sentiment": "positive", "text": "very helpful"}
            ]
        }
    }
]

result = lx.extract(
    text_or_documents=customer_reviews,
    prompt_description="Extract sentiment and specific aspects from customer feedback",
    examples=feedback_examples,
    model_id="gemini-2.0-flash-exp"
)

4. Research Paper Processing

research_examples = [
    {
        "input": "We conducted a randomized controlled trial with 200 participants. Results showed 85% efficacy (p<0.05).",
        "output": {
            "study_design": "randomized controlled trial",
            "sample_size": 200,
            "key_findings": [
                {"metric": "efficacy", "value": "85%", "significance": "p<0.05"}
            ],
            "study_type": "clinical trial"
        }
    }
]

result = lx.extract(
    text_or_documents=research_papers,
    prompt_description="Extract study methodology, sample sizes, and key findings",
    examples=research_examples,
    model_id="gemini-2.0-flash-exp"
)

Integration Patterns

1. With Pandas for Data Analysis

import pandas as pd
import langextract as lx

# Extract structured data
result = lx.extract(
    text_or_documents=documents,
    prompt_description="Extract financial metrics",
    examples=examples,
    model_id="gemini-2.0-flash-exp"
)

# Convert to DataFrame
data_rows = []
for extraction in result.extractions:
    for metric in extraction.data.get("metrics", []):
        data_rows.append({
            "source": extraction.metadata.get("source"),
            "metric_name": metric["name"],
            "value": metric["value"],
            "period": metric.get("period")
        })

df = pd.DataFrame(data_rows)
print(df.groupby("metric_name")["value"].mean())

2. With LangChain for RAG Systems

import langextract as lx
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# Extract structured data first
extraction_result = lx.extract(
    text_or_documents=documents,
    prompt_description="Extract key concepts and definitions",
    examples=examples,
    model_id="gemini-2.0-flash-exp"
)

# Create vector store from extracted data
texts = []
metadatas = []

for extraction in extraction_result.extractions:
    for concept in extraction.data.get("concepts", []):
        texts.append(f"{concept['term']}: {concept['definition']}")
        metadatas.append({
            "source": extraction.metadata.get("source"),
            "term": concept["term"],
            "source_spans": extraction.source_spans.get(concept["id"], [])
        })

vectorstore = Chroma.from_texts(
    texts=texts,
    metadatas=metadatas,
    embedding=OpenAIEmbeddings()
)

3. With FastAPI for API Services

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import langextract as lx

app = FastAPI()

class ExtractionRequest(BaseModel):
    # Request body accepted by the POST /extract endpoint.
    text: str  # passed to lx.extract as text_or_documents
    task_description: str  # passed to lx.extract as prompt_description
    examples: list  # few-shot examples, passed to lx.extract unchanged

class ExtractionResponse(BaseModel):
    # Response body returned by the POST /extract endpoint.
    extractions: list  # the `data` attribute of every extraction in the result
    visualization_html: str  # value returned by result.visualize()

@app.post("/extract", response_model=ExtractionResponse)
async def extract_information(request: ExtractionRequest):
    """Run an extraction over the submitted text.

    Args:
        request: Payload carrying the text, the task description and the
            few-shot examples to forward to ``lx.extract``.

    Returns:
        An ``ExtractionResponse`` holding the ``data`` of each extraction and
        the HTML produced by ``result.visualize()``.

    Raises:
        HTTPException: Status 500, with the error text as detail, when the
            extraction or the rendering fails.
    """
    # Local import keeps this handler self-contained; fastapi is already a
    # dependency of this snippet.
    from fastapi.concurrency import run_in_threadpool

    try:
        # lx.extract is a synchronous call. Invoking it directly inside an
        # ``async def`` handler would block the event loop (and every other
        # request) until it returns, so it is run in the worker thread pool.
        result = await run_in_threadpool(
            lx.extract,
            text_or_documents=request.text,
            prompt_description=request.task_description,
            examples=request.examples,
            model_id="gemini-2.0-flash-exp",
        )

        return ExtractionResponse(
            extractions=[ext.data for ext in result.extractions],
            visualization_html=result.visualize(),
        )
    except Exception as e:
        # Top-level boundary: turn any failure into a 500 response while
        # keeping the original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e

Performance Optimization

1. Efficient Example Selection

# Use minimal but representative examples
efficient_examples = [
    {
        "input": "Short representative text",
        "output": {"key_field": "value"}
    },
    # Limit to 3-5 high-quality examples
]

# Avoid overly complex output schemas
result = lx.extract(
    text_or_documents=text,
    prompt_description="Clear, specific task description",
    examples=efficient_examples,
    model_id="gemini-2.0-flash-exp"  # Faster for most tasks
)

2. Chunking Strategy for Long Documents

def chunk_document(text, max_chunk_size=8000, overlap=200):
    """Split a document into overlapping chunks of whitespace-separated words.

    Args:
        text: Document text; it is split on whitespace into words.
        max_chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        List of chunk strings, each made of words re-joined with single
        spaces. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_chunk_size``.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive number of words")
    # A step of zero would make range() fail and a negative step would yield
    # no chunks at all, silently dropping the whole document.
    if not 0 <= overlap < max_chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than max_chunk_size")

    words = text.split()
    step = max_chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + max_chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the end of the text, any further chunk would
        # only repeat overlap words that are already covered.
        if end >= len(words):
            break
    return chunks

# Process long documents efficiently
long_text = "Very long document content..."
chunks = chunk_document(long_text)

results = []
for chunk in chunks:
    result = lx.extract(
        text_or_documents=chunk,
        prompt_description=prompt,
        examples=examples,
        model_id="gemini-2.0-flash-exp"
    )
    results.append(result)

3. Caching and Rate Limiting

import time
from functools import lru_cache
import hashlib

@lru_cache(maxsize=100)
def cached_extract(text, prompt, examples_str, model_id):
    """Run ``lx.extract`` once per distinct input and memoize the result.

    All arguments must be hashable because they form the ``lru_cache`` key.
    The text itself (not a digest of it) is part of the key, so the function
    has what it needs to perform the extraction.

    Args:
        text: Source text to extract from.
        prompt: Task description forwarded as ``prompt_description``.
        examples_str: JSON-encoded few-shot examples (a hashable stand-in for
            the examples list).
        model_id: Model identifier forwarded to ``lx.extract``.

    Returns:
        Whatever ``lx.extract`` returns for these inputs.
    """
    import json

    return lx.extract(
        text_or_documents=text,
        prompt_description=prompt,
        # json.loads only parses data; unlike eval it cannot execute code.
        examples=json.loads(examples_str),
        model_id=model_id,
    )


def extract_with_rate_limit(text, prompt, examples, model_id, delay=1.0):
    """Extract with caching, pausing after each call that was not cached.

    Args:
        text: Source text to extract from.
        prompt: Task description.
        examples: Few-shot examples; must be JSON-serializable.
        model_id: Model identifier.
        delay: Seconds to sleep after a call that missed the cache.

    Returns:
        The (possibly cached) result of ``cached_extract``.

    Raises:
        TypeError: If ``examples`` cannot be serialized to JSON.
    """
    import json

    # sort_keys makes equal example dicts produce the same cache key
    # regardless of key insertion order.
    examples_str = json.dumps(examples, sort_keys=True)

    misses_before = cached_extract.cache_info().misses
    result = cached_extract(text, prompt, examples_str, model_id)

    # Only a cache miss triggers a new lx.extract call, so cache hits
    # return immediately instead of paying the rate-limit delay.
    if cached_extract.cache_info().misses > misses_before:
        time.sleep(delay)  # Rate limiting
    return result

Error Handling and Debugging

1. Robust Error Handling

import langextract as lx
from typing import Optional, List

def safe_extract(
    text: str,
    prompt: str,
    examples: List[dict],
    model_id: str = "gemini-2.0-flash-exp",
    max_retries: int = 3
) -> Optional[lx.ExtractionResult]:
    """Extract with error handling and retries.

    Args:
        text: Source text to extract from.
        prompt: Task description forwarded as ``prompt_description``.
        examples: Few-shot examples forwarded to ``lx.extract``.
        model_id: Model identifier.
        max_retries: Maximum number of attempts.

    Returns:
        The result of the first successful ``lx.extract`` call, or ``None``
        when every attempt raised (or ``max_retries`` is not positive).
    """
    # Imported locally: this snippet's own import lines only bring in
    # langextract and typing, so ``time`` would otherwise be undefined.
    import time

    for attempt in range(max_retries):
        try:
            return lx.extract(
                text_or_documents=text,
                prompt_description=prompt,
                examples=examples,
                model_id=model_id
            )
        except Exception as e:
            # Deliberately broad: this is the retry boundary, and any failure
            # is reported and retried rather than propagated.
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == max_retries - 1:
                print(f"All {max_retries} attempts failed")
                return None
            time.sleep(2 ** attempt)  # Exponential backoff

    # Reached only when max_retries <= 0 and no attempt was made.
    return None

# Usage
result = safe_extract(text, prompt, examples)
if result:
    print("Extraction successful")
    print(result.extractions)
else:
    print("Extraction failed after all retries")

2. Validation and Quality Checks

def validate_extraction_result(result: lx.ExtractionResult, expected_fields: List[str]) -> bool:
    """Check that every extraction carries data with all expected fields.

    Args:
        result: Extraction result to inspect; may be falsy.
        expected_fields: Keys that must be present in each extraction's data.

    Returns:
        ``False`` when the result or its extractions are empty, when any
        extraction has empty data, or when an expected field is absent (the
        first missing field is printed); ``True`` otherwise.
    """
    extractions = result.extractions if result else None
    if not extractions:
        return False

    for extraction in extractions:
        payload = extraction.data
        if not payload:
            return False

        # Stop at the first expected field that is absent from the data.
        for field in expected_fields:
            if field in payload:
                continue
            print(f"Missing field: {field}")
            return False

    return True

# Usage
result = lx.extract(...)
is_valid = validate_extraction_result(result, ["entities", "relationships"])

if not is_valid:
    print("Extraction result validation failed")

3. Debugging and Inspection

def debug_extraction(result: lx.ExtractionResult):
    """Print a summary of an extraction result for debugging.

    Prints the total number of extractions, then for each one (numbered
    from 1) its text length, the keys of its data, the number of source
    spans, and its metadata.

    Args:
        result: Extraction result whose ``extractions`` are inspected.
    """
    print(f"Number of extractions: {len(result.extractions)}")

    for i, extraction in enumerate(result.extractions):
        # enumerate is zero-based; the printed numbering starts at 1.
        print(f"\nExtraction {i + 1}:")
        print(f"  Text length: {len(extraction.text)}")
        print(f"  Data keys: {list(extraction.data.keys())}")
        print(f"  Source spans: {len(extraction.source_spans)}")
        print(f"  Metadata: {extraction.metadata}")

# Usage
result = lx.extract(...)
debug_extraction(result)

Best Practices

1. Example Design

# ✅ Good: Clear, specific examples
good_examples = [
    {
        "input": "Dr. Smith prescribed aspirin 81mg daily for cardiovascular protection",
        "output": {
            "physician": "Dr. Smith",
            "medication": {
                "name": "aspirin",
                "dose": "81mg",
                "frequency": "daily",
                "indication": "cardiovascular protection"
            }
        }
    }
]

# ❌ Avoid: Vague or inconsistent examples
bad_examples = [
    {
        "input": "Some text",
        "output": {"stuff": "things"}
    }
]

2. Prompt Engineering

# ✅ Good: Specific, actionable prompts
good_prompt = "Extract medication names, dosages, frequencies, and indications from clinical notes. Include the prescribing physician if mentioned."

# ❌ Avoid: Vague prompts
bad_prompt = "Extract medical information"

# Use the good prompt
result = lx.extract(
    text_or_documents=clinical_notes,
    prompt_description=good_prompt,
    examples=good_examples,
    model_id="gemini-2.0-flash-exp"
)

3. Model Selection Guidelines

# Choose model based on task complexity
def select_model(task_complexity: str) -> str:
    """Return the model id suited to the given task complexity.

    Args:
        task_complexity: One of ``"simple"``, ``"moderate"``, ``"complex"``
            or ``"specialized"``; any other value falls back to the default.

    Returns:
        ``"gemini-2.0-pro"`` for complex or specialized tasks (deep
        reasoning, domain expertise); ``"gemini-2.0-flash-exp"`` for simple
        or moderate tasks (fast, cost-effective) and for unknown values.
    """
    if task_complexity in {"complex", "specialized"}:
        return "gemini-2.0-pro"
    # "simple", "moderate" and every unrecognized value use the fast model.
    return "gemini-2.0-flash-exp"

# Usage
model_id = select_model("moderate")
result = lx.extract(
    text_or_documents=text,
    prompt_description=prompt,
    examples=examples,
    model_id=model_id
)

4. Output Quality Assurance

def ensure_output_quality(result: lx.ExtractionResult) -> bool:
    """Score an extraction result against three quality checks.

    The checks are: at least one extraction exists, every extraction has at
    least one source span, and every extraction has non-empty data. With no
    extractions, the two per-extraction checks pass vacuously. The score is
    printed as ``passed/total``.

    Args:
        result: Extraction result whose ``extractions`` are inspected.

    Returns:
        ``True`` only when all three checks pass.
    """
    has_extractions = len(result.extractions) > 0
    has_source_spans = all(
        len(ext.source_spans) > 0 for ext in result.extractions
    )
    data_not_empty = all(ext.data for ext in result.extractions)

    outcomes = (has_extractions, has_source_spans, data_not_empty)
    passed_checks = sum(outcomes)
    total_checks = len(outcomes)

    print(f"Quality score: {passed_checks}/{total_checks}")
    return passed_checks == total_checks

# Usage
result = lx.extract(...)
if ensure_output_quality(result):
    print("High quality extraction")
else:
    print("Consider refining examples or prompt")

Troubleshooting

Common Issues

Empty Extractions

# Check input text length and examples
if not result.extractions:
    print(f"Input length: {len(text)} characters")
    print(f"Number of examples: {len(examples)}")
    # Try simpler examples or clearer prompt

Inconsistent Output Format

# Ensure examples follow consistent schema
# Use more specific prompt descriptions
# Consider using fewer but higher-quality examples

Missing Source Spans

# Verify text preprocessing doesn't remove character positions
# Check if extraction entities exist in source text

API Rate Limits

# Implement exponential backoff
# Use caching for repeated requests
# Consider batch processing

Debugging Checklist

[ ] Examples follow consistent format
[ ] Prompt is specific and actionable
[ ] Input text is well-formatted
[ ] Model selection matches task complexity
[ ] API credentials are properly configured
[ ] Rate limiting is implemented for production use

For the latest updates and detailed documentation, visit the LangExtract GitHub repository: https://github.com/google/langextract