#!/usr/bin/env python3
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "transformers>=4.57.0",
#     "jinja2",
#     "fastapi",
#     "uvicorn",
#     "pydantic",
#     "requests",
# ]
# ///
"""
Chat Template Testing Tool

Test HuggingFace chat templates against Ollama renderers.

Usage:
    # Run predefined test cases against a HuggingFace model
    uv run cmd/chat_template/chat_template.py --model PrimeIntellect/INTELLECT-3

    # Compare HuggingFace output with Ollama renderer
    uv run cmd/chat_template/chat_template.py --model PrimeIntellect/INTELLECT-3 --ollama-model intellect3

    # Start server for manual curl testing
    uv run cmd/chat_template/chat_template.py --serve

    # Show chat template for a model
    uv run cmd/chat_template/chat_template.py --model PrimeIntellect/INTELLECT-3 --show-template
"""

import argparse
import json
import sys
from typing import Any

from transformers import AutoTokenizer

TEST_CASES = [
    {
        "name": "basic_user_message",
        "messages": [{"role": "user", "content": "Hello!"}],
        "tools": None,
    },
    {
        "name": "with_system_message",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        "tools": None,
    },
    {
        "name": "multi_turn_conversation",
        "messages": [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "How are you?"},
        ],
        "tools": None,
    },
    {
        "name": "with_tools",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the weather?"},
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather",
                    "parameters": {
                        "type": "object",
                        "required": ["location"],
                        "properties": {
                            "location": {"type": "string", "description": "The city"}
                        },
                    },
                },
            }
        ],
    },
    {
        "name": "tool_call_and_response",
        "messages": [
            {"role": "user", "content": "What is the weather in SF?"},
            {
                "role": "assistant",
                "content": "Let me check the weather.",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"location": "San Francisco"},
                        },
                    }
                ],
            },
            {"role": "tool", "content": '{"temperature": 68}', "tool_call_id": "call_1"},
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather",
                    "parameters": {
                        "type": "object",
                        "required": ["location"],
                        "properties": {
                            "location": {"type": "string", "description": "The city"}
                        },
                    },
                },
            }
        ],
    },
    {
        "name": "parallel_tool_calls",
        "messages": [
            {"role": "user", "content": "Get weather in SF and NYC"},
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"location": "San Francisco"},
                        },
                    },
                    {
                        "id": "call_2",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"location": "New York"},
                        },
                    },
                ],
            },
            {"role": "tool", "content": '{"temperature": 68}', "tool_call_id": "call_1"},
            {"role": "tool", "content": '{"temperature": 55}', "tool_call_id": "call_2"},
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "parameters": {
                        "type": "object",
                        "properties": {"location": {"type": "string"}},
                    },
                },
            }
        ],
    },
    # Thinking tests
    {
        "name": "assistant_with_thinking",
        "messages": [
            {"role": "user", "content": "What is 2+2?"},
            {
                "role": "assistant",
                "content": "The answer is 4.",
                "thinking": "Let me calculate: 2 + 2 = 4. This is basic arithmetic.",
            },
            {"role": "user", "content": "And 3+3?"},
        ],
        "tools": None,
    },
    {
        "name": "thinking_with_tool_call",
        "messages": [
            {"role": "user", "content": "What's the weather in Paris?"},
            {
                "role": "assistant",
                "content": "I'll check the weather for you.",
                "thinking": "The user wants to know the weather in Paris. I should call the get_weather function.",
                "tool_calls": [
                    {
                        "id": "call_1",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {"location": "Paris"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "content": '{"temperature": 18, "condition": "cloudy"}',
                "tool_call_id": "call_1",
            },
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get current weather",
                    "parameters": {
                        "type": "object",
                        "properties": {"location": {"type": "string"}},
                    },
                },
            }
        ],
    },
    {
        "name": "thinking_only_no_content",
        "messages": [
            {"role": "user", "content": "Think about this silently."},
            {
                "role": "assistant",
                "content": "",  # HuggingFace requires the content field to be present
                "thinking": "I'm thinking about this but won't respond with visible content.",
            },
            {"role": "user", "content": "What did you think?"},
        ],
        "tools": None,
    },
]
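
# A new case can be appended with the same shape. A hypothetical example (the
# name and message below are illustrative only, not part of the suite):
#
#   {
#       "name": "my_custom_case",
#       "messages": [{"role": "user", "content": "..."}],
#       "tools": None,
#   }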

# Cache for tokenizers, keyed by model name, so repeated test cases do not
# reload the same tokenizer.
_tokenizer_cache: dict[str, Any] = {}


def get_tokenizer(model_name: str):
    """Get or create a tokenizer for the given model."""
    if model_name not in _tokenizer_cache:
        print(f"Loading tokenizer for {model_name}...", file=sys.stderr)
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(model_name)
    return _tokenizer_cache[model_name]


def apply_template(
    model: str,
    messages: list[dict],
    tools: list[dict] | None = None,
) -> str:
    """Apply the HuggingFace chat template to messages."""
    tokenizer = get_tokenizer(model)
    return tokenizer.apply_chat_template(
        messages,
        tools=tools or None,  # treat an empty tool list like "no tools"
        tokenize=False,
        add_generation_prompt=True,
    )


def get_ollama_prompt(
    ollama_model: str,
    messages: list[dict],
    tools: list[dict] | None = None,
    ollama_host: str = "http://localhost:11434",
) -> str | None:
    """Get the rendered prompt from Ollama using _debug_render_only."""
    import requests

    # Convert messages to Ollama format
    ollama_messages = []
    for msg in messages:
        ollama_msg = {"role": msg["role"]}
        if "content" in msg:
            ollama_msg["content"] = msg["content"]
        if "thinking" in msg:
            ollama_msg["thinking"] = msg["thinking"]
        if "tool_calls" in msg:
            # Convert tool_calls to Ollama format
            tool_calls = []
            for tc in msg["tool_calls"]:
                tool_call = {
                    "function": {
                        "name": tc["function"]["name"],
                        "arguments": tc["function"]["arguments"],
                    }
                }
                if "id" in tc:
                    tool_call["id"] = tc["id"]
                tool_calls.append(tool_call)
            ollama_msg["tool_calls"] = tool_calls
        if "tool_call_id" in msg:
            ollama_msg["tool_call_id"] = msg["tool_call_id"]
        ollama_messages.append(ollama_msg)

    payload = {
        "model": ollama_model,
        "messages": ollama_messages,
        "stream": False,
        "_debug_render_only": True,
    }
    if tools:
        payload["tools"] = tools

    try:
        resp = requests.post(f"{ollama_host}/api/chat", json=payload, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        # Field name is _debug_info with underscore prefix
        if "_debug_info" in data and "rendered_template" in data["_debug_info"]:
            return data["_debug_info"]["rendered_template"]
        return None
    except requests.exceptions.ConnectionError:
        print(f"  [ERROR] Cannot connect to Ollama at {ollama_host}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"  [ERROR] Ollama request failed: {e}", file=sys.stderr)
        return None
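

# For reference, a successful Ollama debug response is expected to look
# roughly like the following (an assumption inferred from the fields read
# above; the rendered text itself varies by model):
#
#   {"_debug_info": {"rendered_template": "<rendered prompt text>"}}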


def compute_diff(hf_prompt: str, ollama_prompt: str) -> str:
    """Compute a unified diff between the HuggingFace and Ollama prompts."""
    import difflib

    hf_lines = hf_prompt.splitlines(keepends=True)
    ollama_lines = ollama_prompt.splitlines(keepends=True)
    # The input lines keep their endings, so let unified_diff use its default
    # lineterm; passing lineterm="" here would run the ---/+++/@@ header lines
    # together when the output is joined.
    diff = difflib.unified_diff(
        ollama_lines,
        hf_lines,
        fromfile="Ollama",
        tofile="HuggingFace",
    )
    return "".join(diff)


def print_test_output(
    name: str,
    messages: list[dict],
    tools: list[dict] | None,
    hf_prompt: str,
    ollama_prompt: str | None = None,
    as_repr: bool = False,
):
    """Print test output in a format suitable for Go test creation and LLM diffing."""
    print(f"\n{'='*60}")
    print(f"Test: {name}")
    print("=" * 60)

    print("\n--- Input Messages ---")
    print(json.dumps(messages, indent=2))

    if tools:
        print("\n--- Tools ---")
        print(json.dumps(tools, indent=2))

    if ollama_prompt is not None:
        # Comparison mode
        if hf_prompt == ollama_prompt:
            print("\n--- Result: MATCH ---")
            print("\n--- Prompt (both identical) ---")
            if as_repr:
                print(repr(hf_prompt))
            else:
                print(hf_prompt)
        else:
            print("\n--- Result: MISMATCH ---")
            print("\n--- HuggingFace Prompt ---")
            if as_repr:
                print(repr(hf_prompt))
            else:
                print(hf_prompt)
            print("\n--- Ollama Prompt ---")
            if as_repr:
                print(repr(ollama_prompt))
            else:
                print(ollama_prompt)
            print("\n--- Diff (Ollama -> HuggingFace) ---")
            diff = compute_diff(hf_prompt, ollama_prompt)
            if diff:
                print(diff)
            else:
                print("(no line-level diff, check whitespace)")
    else:
        # HuggingFace-only mode
        print("\n--- HuggingFace Prompt ---")
        if as_repr:
            print(repr(hf_prompt))
        else:
            print(hf_prompt)

    print("=" * 60)


def run_tests(
    model: str,
    as_repr: bool = False,
    test_filter: str | None = None,
    ollama_model: str | None = None,
    ollama_host: str = "http://localhost:11434",
):
    """Run all predefined test cases against a model."""
    if ollama_model:
        print(f"\nComparing HuggingFace ({model}) vs Ollama ({ollama_model})\n")
    else:
        print(f"\nRunning tests against: {model}\n")

    matches = 0
    mismatches = 0
    errors = 0

    for test_case in TEST_CASES:
        name = test_case["name"]
        messages = test_case["messages"]
        tools = test_case["tools"]

        # Filter tests if specified
        if test_filter and test_filter.lower() not in name.lower():
            continue

        try:
            hf_prompt = apply_template(model, messages, tools)

            ollama_prompt = None
            if ollama_model:
                ollama_prompt = get_ollama_prompt(
                    ollama_model, messages, tools, ollama_host
                )
                if ollama_prompt is None:
                    errors += 1
                elif hf_prompt == ollama_prompt:
                    matches += 1
                else:
                    mismatches += 1

            print_test_output(
                name, messages, tools, hf_prompt, ollama_prompt, as_repr=as_repr
            )
        except Exception as e:
            errors += 1
            print(f"\n{'='*60}")
            print(f"Test: {name} - FAILED")
            print("--- Input Messages ---")
            print(json.dumps(messages, indent=2))
            if tools:
                print("--- Tools ---")
                print(json.dumps(tools, indent=2))
            print("--- Error ---")
            print(f"{e}")
            print("=" * 60)

    # Print summary if comparing
    if ollama_model:
        total = matches + mismatches + errors
        print(f"\n{'='*60}")
        print("SUMMARY")
        print("=" * 60)
        print(f"  Total:      {total}")
        print(f"  Matches:    {matches}")
        print(f"  Mismatches: {mismatches}")
        print(f"  Errors:     {errors}")
        print("=" * 60)


def show_template(model: str):
    """Show the chat template for a model."""
    tokenizer = get_tokenizer(model)
    print(f"\nChat template for {model}:\n")
    print("-" * 60)
    print(tokenizer.chat_template)
    print("-" * 60)
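

# Once the server below is running (--serve), a prompt can be generated by
# hand. A minimal sketch, assuming the default host and port:
#
#   curl -X POST http://localhost:8000/generate-prompt \
#     -H "Content-Type: application/json" \
#     -d '{"model": "PrimeIntellect/INTELLECT-3",
#          "messages": [{"role": "user", "content": "Hello!"}]}'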


def start_server(host: str = "0.0.0.0", port: int = 8000):
    """Start the FastAPI server for manual testing."""
    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel
    import uvicorn

    class Message(BaseModel):
        role: str
        content: str | None = None
        thinking: str | None = None  # mirrors the thinking support in the test cases
        tool_calls: list[dict[str, Any]] | None = None
        tool_call_id: str | None = None

    class GeneratePromptRequest(BaseModel):
        messages: list[Message]
        model: str = "PrimeIntellect/INTELLECT-3"
        tools: list[dict[str, Any]] | None = None
        inject_tools_as_functions: bool = False  # accepted but currently unused

    class GeneratePromptResponse(BaseModel):
        prompt: str
        model: str

    app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")

    @app.post("/generate-prompt", response_model=GeneratePromptResponse)
    async def generate_prompt(request: GeneratePromptRequest):
        try:
            messages = []
            for msg in request.messages:
                message_dict = {"role": msg.role}
                if msg.content is not None:
                    message_dict["content"] = msg.content
                if msg.thinking is not None:
                    message_dict["thinking"] = msg.thinking
                if msg.tool_calls is not None:
                    tool_calls = []
                    for tc in msg.tool_calls:
                        tc_copy = tc.copy()
                        # Chat templates expect arguments as an object, so
                        # decode JSON-encoded argument strings when possible.
                        if "function" in tc_copy and "arguments" in tc_copy["function"]:
                            args = tc_copy["function"]["arguments"]
                            if isinstance(args, str):
                                try:
                                    tc_copy["function"]["arguments"] = json.loads(args)
                                except json.JSONDecodeError:
                                    pass
                        tool_calls.append(tc_copy)
                    message_dict["tool_calls"] = tool_calls
                if msg.tool_call_id is not None:
                    message_dict["tool_call_id"] = msg.tool_call_id
                messages.append(message_dict)

            prompt = apply_template(request.model, messages, request.tools)
            return GeneratePromptResponse(prompt=prompt, model=request.model)
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e))

    @app.get("/health")
    async def health_check():
        return {"status": "healthy"}

    print(f"Starting server on http://{host}:{port}")
    print("Endpoints:")
    print("  POST /generate-prompt - Generate prompt from messages")
    print("  GET  /health          - Health check")

    uvicorn.run(app, host=host, port=port)


def main():
    parser = argparse.ArgumentParser(
        description="Chat Template Testing Tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--model",
        "-m",
        type=str,
        help="HuggingFace model name (e.g., PrimeIntellect/INTELLECT-3)",
    )
    parser.add_argument(
        "--ollama-model",
        "-o",
        type=str,
        help="Ollama model name to compare against (e.g., qwen3-coder)",
    )
    parser.add_argument(
        "--ollama-host",
        type=str,
        default="http://localhost:11434",
        help="Ollama server URL (default: http://localhost:11434)",
    )
    parser.add_argument(
        "--serve",
        "-s",
        action="store_true",
        help="Start FastAPI server for manual curl testing",
    )
    parser.add_argument(
        "--port",
        "-p",
        type=int,
        default=8000,
        help="Server port (default: 8000)",
    )
    parser.add_argument(
        "--show-template",
        "-t",
        action="store_true",
        help="Show the chat template for the model",
    )
    parser.add_argument(
        "--repr",
        "-r",
        action="store_true",
        help="Output prompts as Python repr (shows escape sequences)",
    )
    parser.add_argument(
        "--filter",
        "-f",
        type=str,
        help="Filter tests by name (substring match)",
    )

    args = parser.parse_args()

    if args.serve:
        start_server(port=args.port)
    elif args.model:
        if args.show_template:
            show_template(args.model)
        else:
            run_tests(
                args.model,
                as_repr=args.repr,
                test_filter=args.filter,
                ollama_model=args.ollama_model,
                ollama_host=args.ollama_host,
            )
    else:
        parser.print_help()
        print("\nExample usage:")
        print("  uv run cmd/chat_template/chat_template.py --model PrimeIntellect/INTELLECT-3")
        print("  uv run cmd/chat_template/chat_template.py --model Qwen/Qwen3-Coder-480B-A35B-Instruct --ollama-model qwen3-coder")
        print("  uv run cmd/chat_template/chat_template.py --serve")
        sys.exit(1)


if __name__ == "__main__":
    main()