#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "pypdf",
#     "pdfplumber",
# ]
# ///
"""
PDF Processing Script for non-technical users.
Handles common PDF operations: info, text extraction, search, split, merge.

Usage: uv run scripts/process_pdf.py <filepath> <command> [args...] [--output <output_path>]
"""

import sys
import argparse
import re
from pathlib import Path


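# Note: pypdf backs the page-level commands (info, split, merge), while
# pdfplumber backs text and table extraction (text, search, tables, count).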
def load_pdf_pypdf(filepath):
    """Load PDF using pypdf."""
    from pypdf import PdfReader
    path = Path(filepath)
    if not path.exists():
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    try:
        return PdfReader(filepath)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        sys.exit(1)


def load_pdf_plumber(filepath):
    """Load PDF using pdfplumber (better for text/tables)."""
    import pdfplumber
    path = Path(filepath)
    if not path.exists():
        print(f"Error: File not found: {filepath}")
        sys.exit(1)
    try:
        return pdfplumber.open(filepath)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        sys.exit(1)


def parse_page_range(pages_str, max_pages):
    """Parse page range string like '1,2,3' or '1-5' or '1,3-5,7'."""
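    # Worked example (illustrative): parse_page_range("1,3-5,9", max_pages=6)
    # collects {1, 3, 4, 5, 9}, drops 9 as out of range, and returns [1, 3, 4, 5].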
    if not pages_str:
        return list(range(1, max_pages + 1))

    pages = set()
    parts = pages_str.split(',')
    for part in parts:
        part = part.strip()
        try:
            if '-' in part:
                start, end = part.split('-', 1)
                start = int(start.strip())
                end = int(end.strip())
                pages.update(range(start, end + 1))
            else:
                pages.add(int(part))
        except ValueError:
            print(f"Error: Invalid page range '{part}' (expected e.g. '3' or '2-5')")
            sys.exit(1)

    # Filter to valid range and sort
    valid_pages = sorted([p for p in pages if 1 <= p <= max_pages])
    return valid_pages


def cmd_info(args):
    """Show PDF information."""
    reader = load_pdf_pypdf(args.filepath)

    print("=" * 60)
    print("PDF INFORMATION")
    print("=" * 60)

    print(f"\nFile: {args.filepath}")
    print(f"Pages: {len(reader.pages)}")

    # File size
    path = Path(args.filepath)
    size_bytes = path.stat().st_size
    if size_bytes < 1024:
        size_str = f"{size_bytes} bytes"
    elif size_bytes < 1024 * 1024:
        size_str = f"{size_bytes / 1024:.1f} KB"
    else:
        size_str = f"{size_bytes / (1024 * 1024):.1f} MB"
    print(f"Size: {size_str}")

    # Metadata
    meta = reader.metadata
    if meta:
        print("\n" + "-" * 40)
        print("METADATA:")
        print("-" * 40)
        if meta.title:
            print(f"  Title: {meta.title}")
        if meta.author:
            print(f"  Author: {meta.author}")
        if meta.subject:
            print(f"  Subject: {meta.subject}")
        if meta.creator:
            print(f"  Creator: {meta.creator}")
        if meta.creation_date:
            print(f"  Created: {meta.creation_date}")
        if meta.modification_date:
            print(f"  Modified: {meta.modification_date}")


def cmd_text(args):
    """Extract text from PDF."""
    pdf = load_pdf_plumber(args.filepath)

    pages = parse_page_range(args.pages, len(pdf.pages))

    print("=" * 60)
    if args.pages:
        print(f"TEXT EXTRACTION (pages {args.pages})")
    else:
        print("TEXT EXTRACTION (all pages)")
    print("=" * 60)

    for page_num in pages:
        page = pdf.pages[page_num - 1]  # 0-indexed
        text = page.extract_text() or ""

        print(f"\n--- Page {page_num} ---\n")
        if text.strip():
            print(text)
        else:
            print("(No text found on this page - may be an image or scan)")

    pdf.close()


def cmd_search(args):
    """Search for text in PDF."""
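    # Matches are wrapped inline with >>> <<< markers; illustrative example:
    # searching "total" in a line "Grand Total: 42" prints "Grand >>>Total<<<: 42"
    # (the search is case-insensitive, original casing is kept).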
    if not args.query:
        print("Error: Please provide a search query")
        sys.exit(1)

    pdf = load_pdf_plumber(args.filepath)
    query = args.query.lower()

    print("=" * 60)
    print(f"SEARCH RESULTS: '{args.query}'")
    print("=" * 60)

    total_matches = 0

    for i, page in enumerate(pdf.pages):
        page_num = i + 1
        text = page.extract_text() or ""

        # Find matches with context
        text_lower = text.lower()
        if query in text_lower:
            # Count occurrences
            count = text_lower.count(query)
            total_matches += count

            print(f"\n--- Page {page_num} ({count} match{'es' if count > 1 else ''}) ---")

            # Show context around each match
            lines = text.split('\n')
            for line in lines:
                if query in line.lower():
                    # Highlight the match with >>> <<< markers
                    highlighted = re.sub(
                        f'({re.escape(args.query)})',
                        r'>>>\1<<<',
                        line,
                        flags=re.IGNORECASE
                    )
                    print(f"  {highlighted}")

    print(f"\n{'=' * 40}")
    if total_matches == 0:
        print(f"No matches found for '{args.query}'")
    else:
        print(f"Total: {total_matches} match{'es' if total_matches > 1 else ''} found")

    pdf.close()


def cmd_tables(args):
    """Extract tables from PDF."""
    pdf = load_pdf_plumber(args.filepath)

    print("=" * 60)
    print("TABLE EXTRACTION")
    print("=" * 60)

    table_count = 0

    for i, page in enumerate(pdf.pages):
        page_num = i + 1
        tables = page.extract_tables()

        if tables:
            for table in tables:
                table_count += 1
                print(f"\n--- Table {table_count} (Page {page_num}) ---\n")

                # Print as CSV-like format
                for row in table:
                    # Clean up None values
                    cleaned = [str(cell).strip() if cell else "" for cell in row]
                    print(",".join(cleaned))

    if table_count == 0:
        print("\nNo tables found in this PDF.")
        print("Note: Table extraction works best with clearly structured tables.")
    else:
        print(f"\n{'=' * 40}")
        print(f"Total: {table_count} table{'s' if table_count > 1 else ''} found")

    pdf.close()


def cmd_count(args):
    """Count words and characters in PDF."""
    pdf = load_pdf_plumber(args.filepath)

    total_chars = 0
    total_words = 0
    page_stats = []

    for i, page in enumerate(pdf.pages):
        text = page.extract_text() or ""
        chars = len(text)
        words = len(text.split())
        total_chars += chars
        total_words += words
        page_stats.append((i + 1, words, chars))

    print("=" * 60)
    print("DOCUMENT STATISTICS")
    print("=" * 60)

    print(f"\nTotal pages: {len(pdf.pages)}")
    print(f"Total words: {total_words:,}")
    print(f"Total characters: {total_chars:,}")

    if len(pdf.pages) > 1:
        print(f"\nAverage words per page: {total_words // len(pdf.pages):,}")

    print("\n" + "-" * 40)
    print("PER-PAGE BREAKDOWN:")
    print("-" * 40)
    for page_num, words, chars in page_stats:
        print(f"  Page {page_num}: {words:,} words, {chars:,} chars")

    pdf.close()


def cmd_split(args):
    """Extract specific pages to a new PDF."""
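    # Note: pages are written in ascending order regardless of how the range is
    # given; illustrative example: --pages 5,2 produces page 2 followed by page 5.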
    from pypdf import PdfWriter

    if not args.output:
        print("Error: Please specify output file with --output")
        sys.exit(1)

    reader = load_pdf_pypdf(args.filepath)
    pages = parse_page_range(args.pages, len(reader.pages))

    if not pages:
        print("Error: No valid pages specified")
        sys.exit(1)

    writer = PdfWriter()

    for page_num in pages:
        writer.add_page(reader.pages[page_num - 1])

    with open(args.output, 'wb') as f:
        writer.write(f)

    print(f"Extracted {len(pages)} page(s) to: {args.output}")
    print(f"Pages included: {', '.join(map(str, pages))}")


def cmd_merge(args):
    """Merge multiple PDFs into one."""
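    # Note: input files are appended in the order given on the command line; the
    # positional query slot (and, as a quirk, the --pages value) can carry extra
    # file paths here.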
    from pypdf import PdfReader, PdfWriter

    if not args.output:
        print("Error: Please specify output file with --output")
        sys.exit(1)

    # Collect all input files
    files = [args.filepath]
    if args.query:
        files.append(args.query)
    if args.pages:
        files.append(args.pages)

    # Validate all files exist
    for f in files:
        if not Path(f).exists():
            print(f"Error: File not found: {f}")
            sys.exit(1)

    writer = PdfWriter()
    total_pages = 0

    for filepath in files:
        reader = PdfReader(filepath)
        for page in reader.pages:
            writer.add_page(page)
            total_pages += 1
        print(f"  Added: {filepath} ({len(reader.pages)} pages)")

    with open(args.output, 'wb') as f:
        writer.write(f)

    print(f"\nMerged {len(files)} files ({total_pages} total pages) to: {args.output}")


def main():
    parser = argparse.ArgumentParser(description='Process PDF files')
    parser.add_argument('filepath', help='Path to PDF file (or "merge" command)')
    parser.add_argument('command', nargs='?', default='info',
                        help='Command: info, text, search, tables, count, split, merge')
    parser.add_argument('query', nargs='?', help='Search query or second file for merge')
    parser.add_argument('--pages', '-p', help='Page range (e.g., "1-3" or "1,2,5")')
    parser.add_argument('--output', '-o', help='Output file path')

    args = parser.parse_args()

    # Handle merge as special case (first arg is "merge")
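    # Illustrative: "merge a.pdf b.pdf" parses as filepath='merge',
    # command='a.pdf', query='b.pdf'; the shift below re-labels them.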
    if args.filepath == 'merge':
        if not args.query:
            print("Error: merge requires at least 2 PDF files")
            print("Usage: process_pdf.py merge file1.pdf file2.pdf --output combined.pdf")
            sys.exit(1)
        # Shift args for merge
        args.filepath = args.command
        args.command = 'merge'

    # Run the command
    commands = {
        'info': cmd_info,
        'text': cmd_text,
        'search': cmd_search,
        'tables': cmd_tables,
        'count': cmd_count,
        'split': cmd_split,
        'merge': cmd_merge,
    }

    if args.command not in commands:
        print(f"Error: Unknown command '{args.command}'")
        print(f"Available commands: {', '.join(commands.keys())}")
        sys.exit(1)

    commands[args.command](args)


if __name__ == "__main__":
    main()