ollama/skills/pdf-skill/scripts/process_pdf.py

#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "pypdf",
#     "pdfplumber",
# ]
# ///
"""
PDF Processing Script for non-technical users.
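Handles common PDF operations: info, text extraction, search, table
extraction, word/character counts, split, merge.

Usage: uv run scripts/process_pdf.py <filepath> <command> [query] [--pages <range>] [--output <output_path>]
       uv run scripts/process_pdf.py merge <file1.pdf> <file2.pdf> --output <output_path>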
"""

import sys
import argparse
import re
from pathlib import Path


def load_pdf_pypdf(filepath):
    """Open ``filepath`` with pypdf and return the ``PdfReader``.

    Prints an error and exits with status 1 when the path does not exist
    or when pypdf raises while opening the file.
    """
    from pypdf import PdfReader

    if not Path(filepath).exists():
        print(f"Error: File not found: {filepath}")
        sys.exit(1)

    try:
        reader = PdfReader(filepath)
    except Exception as err:
        print(f"Error reading PDF: {err}")
        sys.exit(1)
    return reader


def load_pdf_plumber(filepath):
    """Open ``filepath`` with pdfplumber (better for text/tables).

    Returns the object produced by ``pdfplumber.open``; the caller is
    responsible for closing it. Prints an error and exits with status 1
    when the path does not exist or pdfplumber raises while opening it.
    """
    import pdfplumber

    if not Path(filepath).exists():
        print(f"Error: File not found: {filepath}")
        sys.exit(1)

    try:
        document = pdfplumber.open(filepath)
    except Exception as err:
        print(f"Error reading PDF: {err}")
        sys.exit(1)
    return document


def parse_page_range(pages_str, max_pages):
    """Parse a page selection such as '1,2,3', '1-5' or '1,3-5,7'.

    Args:
        pages_str: Comma-separated 1-based page numbers and/or inclusive
            ranges. ``None`` or an empty string selects every page.
        max_pages: Number of pages in the document.

    Returns:
        Sorted list of unique page numbers within ``1..max_pages``. Pages
        outside the document are dropped. Empty items (stray commas) are
        ignored, and a reversed range such as '5-3' is read as '3-5'.

    Prints an error and exits with status 1 when an item is neither a
    number nor a ``start-end`` range, in line with the other helpers in
    this script.
    """
    if not pages_str:
        return list(range(1, max_pages + 1))

    pages = set()
    for part in pages_str.split(','):
        part = part.strip()
        if not part:
            # Tolerate stray commas such as "1,2," or "1,,3".
            continue

        try:
            if '-' in part:
                start_text, end_text = part.split('-', 1)
                start, end = int(start_text), int(end_text)
            else:
                start = end = int(part)
        except ValueError:
            print(f"Error: Invalid page range: '{part}'")
            sys.exit(1)

        if start > end:
            start, end = end, start

        # Clamp to the document before expanding, so a typo like
        # "1-999999999" cannot build an enormous set.
        pages.update(range(max(start, 1), min(end, max_pages) + 1))

    return sorted(pages)


def cmd_info(args):
    """Print a summary of the PDF: path, page count, file size, metadata.

    Reads ``args.filepath``. The metadata section is printed only when the
    reader reports metadata, and within it only the non-empty fields.
    """
    reader = load_pdf_pypdf(args.filepath)
    banner = "=" * 60

    print(banner)
    print("PDF INFORMATION")
    print(banner)

    print(f"\nFile: {args.filepath}")
    print(f"Pages: {len(reader.pages)}")

    # Human-readable file size, checked from the largest unit down.
    size_bytes = Path(args.filepath).stat().st_size
    kib = 1024
    mib = 1024 * 1024
    if size_bytes >= mib:
        size_str = f"{size_bytes / mib:.1f} MB"
    elif size_bytes >= kib:
        size_str = f"{size_bytes / kib:.1f} KB"
    else:
        size_str = f"{size_bytes} bytes"
    print(f"Size: {size_str}")

    meta = reader.metadata
    if not meta:
        return

    rule = "-" * 40
    print("\n" + rule)
    print("METADATA:")
    print(rule)

    # (label shown to the user, attribute on the metadata object)
    fields = (
        ("Title", "title"),
        ("Author", "author"),
        ("Subject", "subject"),
        ("Creator", "creator"),
        ("Created", "creation_date"),
        ("Modified", "modification_date"),
    )
    for label, attr in fields:
        value = getattr(meta, attr)
        if value:
            print(f"  {label}: {value}")


def cmd_text(args):
    """Print the extracted text of the selected pages of a PDF.

    Args:
        args: Parsed CLI namespace; reads ``filepath`` and the optional
            ``pages`` range string (all pages when it is unset).

    Exits with status 1 when a page range was given but selects no page
    of the document, the same way the ``split`` command reports it.
    """
    pdf = load_pdf_plumber(args.filepath)
    # try/finally so the handle is closed even if range parsing or text
    # extraction fails part-way through.
    try:
        pages = parse_page_range(args.pages, len(pdf.pages))
        if args.pages and not pages:
            print("Error: No valid pages specified")
            sys.exit(1)

        print("=" * 60)
        if args.pages:
            print(f"TEXT EXTRACTION (pages {args.pages})")
        else:
            print("TEXT EXTRACTION (all pages)")
        print("=" * 60)

        for page_num in pages:
            page = pdf.pages[page_num - 1]  # page numbers are 1-based
            text = page.extract_text() or ""

            print(f"\n--- Page {page_num} ---\n")
            if text.strip():
                print(text)
            else:
                print("(No text found on this page - may be an image or scan)")
    finally:
        pdf.close()


def cmd_search(args):
    """Search every page for ``args.query`` (case-insensitive).

    For each page containing the query, prints the match count followed
    by every matching line with the matches wrapped in ``>>>`` / ``<<<``.
    A grand total is printed at the end.

    Args:
        args: Parsed CLI namespace; reads ``filepath`` and ``query``.

    Exits with status 1 when no query was supplied.
    """
    if not args.query:
        print("Error: Please provide a search query")
        sys.exit(1)

    pdf = load_pdf_plumber(args.filepath)
    # try/finally so the handle is closed even if extraction fails.
    try:
        query = args.query.lower()
        # Compiled once; the query is escaped so it is matched literally.
        pattern = re.compile(f'({re.escape(args.query)})', flags=re.IGNORECASE)

        print("=" * 60)
        print(f"SEARCH RESULTS: '{args.query}'")
        print("=" * 60)

        total_matches = 0

        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""

            count = text.lower().count(query)
            if not count:
                continue
            total_matches += count

            print(f"\n--- Page {page_num} ({count} match{'es' if count > 1 else ''}) ---")

            # Show each line that contains the query, with matches marked.
            for line in text.split('\n'):
                if query in line.lower():
                    highlighted = pattern.sub(r'>>>\1<<<', line)
                    print(f"  {highlighted}")

        print(f"\n{'=' * 40}")
        if total_matches == 0:
            print(f"No matches found for '{args.query}'")
        else:
            print(f"Total: {total_matches} match{'es' if total_matches > 1 else ''} found")
    finally:
        pdf.close()


def cmd_tables(args):
    """Print every table found in the PDF as CSV rows.

    Tables are numbered across the whole document. Cells are stripped of
    surrounding whitespace and empty/``None`` cells become empty fields.
    Rows are written with the ``csv`` module so cells that contain commas,
    quotes or line breaks are quoted instead of corrupting the row.

    Args:
        args: Parsed CLI namespace; reads ``filepath``.
    """
    import csv

    pdf = load_pdf_plumber(args.filepath)
    # try/finally so the handle is closed even if extraction fails.
    try:
        print("=" * 60)
        print("TABLE EXTRACTION")
        print("=" * 60)

        table_count = 0
        # "\n" keeps line endings identical to the surrounding print() calls.
        writer = csv.writer(sys.stdout, lineterminator="\n")

        for page_num, page in enumerate(pdf.pages, start=1):
            for table in page.extract_tables() or []:
                table_count += 1
                print(f"\n--- Table {table_count} (Page {page_num}) ---\n")

                for row in table:
                    writer.writerow(
                        [str(cell).strip() if cell else "" for cell in row]
                    )

        if table_count == 0:
            print("\nNo tables found in this PDF.")
            print("Note: Table extraction works best with clearly structured tables.")
        else:
            print(f"\n{'=' * 40}")
            print(f"Total: {table_count} table{'s' if table_count > 1 else ''} found")
    finally:
        pdf.close()


def cmd_count(args):
    """Print word and character counts for the PDF.

    Counts are taken from each page's extracted text: words are
    whitespace-separated tokens and characters are the length of the
    extracted string. For documents with more than one page, the average
    words per page (integer division) and a per-page breakdown follow.

    Args:
        args: Parsed CLI namespace; reads ``filepath``.
    """
    pdf = load_pdf_plumber(args.filepath)
    # try/finally so the handle is closed even if extraction fails.
    try:
        total_chars = 0
        total_words = 0
        page_stats = []

        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""
            chars = len(text)
            words = len(text.split())
            total_chars += chars
            total_words += words
            page_stats.append((page_num, words, chars))

        page_count = len(pdf.pages)

        print("=" * 60)
        print("DOCUMENT STATISTICS")
        print("=" * 60)

        print(f"\nTotal pages: {page_count}")
        print(f"Total words: {total_words:,}")
        print(f"Total characters: {total_chars:,}")

        if page_count > 1:
            print(f"\nAverage words per page: {total_words // page_count:,}")

            print("\n" + "-" * 40)
            print("PER-PAGE BREAKDOWN:")
            print("-" * 40)
            for page_num, words, chars in page_stats:
                print(f"  Page {page_num}: {words:,} words, {chars:,} chars")
    finally:
        pdf.close()


def cmd_split(args):
    """Copy the selected pages of the input PDF into a new file.

    Reads ``args.filepath``, ``args.pages`` (every page when unset) and
    ``args.output``. Exits with status 1 when no output path was given or
    when the range selects no page of the document.
    """
    from pypdf import PdfReader, PdfWriter

    if not args.output:
        print("Error: Please specify output file with --output")
        sys.exit(1)

    source = load_pdf_pypdf(args.filepath)
    selected = parse_page_range(args.pages, len(source.pages))
    if not selected:
        print("Error: No valid pages specified")
        sys.exit(1)

    writer = PdfWriter()
    for number in selected:
        # Page numbers are 1-based; the reader's page list is 0-based.
        writer.add_page(source.pages[number - 1])

    with open(args.output, 'wb') as out_file:
        writer.write(out_file)

    page_list = ', '.join(str(number) for number in selected)
    print(f"Extracted {len(selected)} page(s) to: {args.output}")
    print(f"Pages included: {page_list}")


def cmd_merge(args):
    """Merge two or more PDFs, in order, into ``args.output``.

    The input files arrive through the generic CLI slots: ``args.filepath``
    is the first file, ``args.query`` the second, and a value passed via
    ``--pages`` is treated as an optional third file.

    Exits with status 1 (after printing an error) when no output path was
    given, fewer than two input files were supplied, an input file does
    not exist, or an input cannot be read as a PDF.
    """
    if not args.output:
        print("Error: Please specify output file with --output")
        sys.exit(1)

    # Collect all input files
    files = [args.filepath]
    if args.query:
        files.append(args.query)
    if args.pages:
        files.append(args.pages)

    if len(files) < 2:
        print("Error: merge requires at least 2 PDF files")
        print("Usage: process_pdf.py merge file1.pdf file2.pdf --output combined.pdf")
        sys.exit(1)

    # Validate every input before doing any work, so nothing is reported
    # as added when a later file is missing.
    for f in files:
        if not Path(f).exists():
            print(f"Error: File not found: {f}")
            sys.exit(1)

    # Imported after validation so argument errors are reported first.
    from pypdf import PdfWriter

    writer = PdfWriter()
    total_pages = 0

    for filepath in files:
        # The shared loader prints "Error reading PDF: ..." and exits
        # instead of letting an unreadable file raise a traceback.
        reader = load_pdf_pypdf(filepath)
        for page in reader.pages:
            writer.add_page(page)
            total_pages += 1
        print(f"  Added: {filepath} ({len(reader.pages)} pages)")

    with open(args.output, 'wb') as f:
        writer.write(f)

    print(f"\nMerged {len(files)} files ({total_pages} total pages) to: {args.output}")


def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Two invocation forms are accepted:
      * ``<filepath> [command] [query] [--pages R] [--output F]`` where
        ``command`` defaults to ``info``;
      * ``merge <file1> <file2> --output F`` where the literal word
        ``merge`` takes the place of the file path.

    Exits with status 1 when ``merge`` is given without any file or when
    the command is not recognised.
    """
    parser = argparse.ArgumentParser(description='Process PDF files')
    parser.add_argument('filepath', help='Path to PDF file (or "merge" command)')
    # No argparse default here: a default of 'info' would make a bare
    # "merge" indistinguishable from "merge info", so the default is
    # applied below, after the merge form has been handled.
    parser.add_argument('command', nargs='?', default=None,
                        help='Command: info, text, search, tables, count, split, merge')
    parser.add_argument('query', nargs='?', help='Search query or second file for merge')
    parser.add_argument('--pages', '-p', help='Page range (e.g., "1-3" or "1,2,5")')
    parser.add_argument('--output', '-o', help='Output file path')

    args = parser.parse_args()

    # Handle merge as special case (first arg is "merge")
    if args.filepath == 'merge':
        if not args.command:
            print("Error: merge requires at least 2 PDF files")
            print("Usage: process_pdf.py merge file1.pdf file2.pdf --output combined.pdf")
            sys.exit(1)
        # Shift args for merge: the "command" slot holds the first file.
        args.filepath = args.command
        args.command = 'merge'
    elif args.command is None:
        args.command = 'info'

    # Run the command
    commands = {
        'info': cmd_info,
        'text': cmd_text,
        'search': cmd_search,
        'tables': cmd_tables,
        'count': cmd_count,
        'split': cmd_split,
        'merge': cmd_merge,
    }

    if args.command not in commands:
        print(f"Error: Unknown command '{args.command}'")
        print(f"Available commands: {', '.join(commands.keys())}")
        sys.exit(1)

    commands[args.command](args)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()