diff --git a/cmd/prompt-rendering/README.md b/cmd/prompt-rendering/README.md
new file mode 100644
index 000000000..c13d73dcd
--- /dev/null
+++ b/cmd/prompt-rendering/README.md
@@ -0,0 +1,156 @@
+# HuggingFace Prompt Renderer MCP Server
+
+Model Context Protocol (MCP) server for rendering conversation messages into
+model-specific prompt strings using HuggingFace tokenizer chat templates.
+
+## Requirements
+
+- [uv](https://docs.astral.sh/uv/) - Fast Python package and project manager,
+  used here to run the script with its inline dependencies
+
+## Usage
+
+### MCP Server Mode
+
+Run the MCP server over stdio for use with MCP clients:
+
+```bash
+uv run cmd/prompt-rendering/server.py --mcp
+```
+
+Add to your MCP client configuration (e.g., for Claude Desktop), setting the
+empty `--directory` value to the path of this repository:
+
+```json
+{
+  "mcpServers": {
+    "huggingface-prompt-renderer": {
+      "command": "uv",
+      "args": [
+        "run",
+        "--directory",
+        "",
+        "cmd/prompt-rendering/server.py",
+        "--mcp"
+      ]
+    }
+  }
+}
+```
+
+### FastAPI Server Mode
+
+Start a FastAPI server for manual HTTP testing:
+
+```bash
+# Start on the default port 8000
+uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000
+
+# Start on a custom port
+uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 9000
+```
+
+#### Endpoints
+
+| Method | Path | Description |
+|--------|------|-------------|
+| POST | `/generate-prompt` | Generate a prompt from messages |
+| GET | `/health` | Health check |
+
+### Test with curl
+
+```bash
+# Basic user message (model defaults to Qwen/Qwen3-Coder-480B-A35B-Instruct)
+curl -X POST http://localhost:8000/generate-prompt \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Hello!"}]
+  }'
+
+# With tools
+curl -X POST http://localhost:8000/generate-prompt \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {"role": "system", "content": "You are a helpful assistant."},
+      {"role": "user", "content": "What is the weather?"}
+    ],
+    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
+    "tools": [{
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the current weather",
+        "parameters": {
+          "type": "object",
+          "required": ["location"],
+          "properties": {
+            "location": {"type": "string", "description": "The city"}
+          }
+        }
+      }
+    }]
+  }'
+
+# With tool calls
+curl -X POST http://localhost:8000/generate-prompt \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {"role": "user", "content": "What is the weather in SF?"},
+      {
+        "role": "assistant",
+        "tool_calls": [{
+          "id": "call_1",
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "arguments": {"location": "San Francisco"}
+          }
+        }]
+      },
+      {"role": "tool", "content": "{\"temperature\": 68}", "tool_call_id": "call_1"}
+    ],
+    "tools": [{
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "parameters": {
+          "type": "object",
+          "properties": {"location": {"type": "string"}}
+        }
+      }
+    }]
+  }'
+```
+
+## Supported Message Formats
+
+The server supports multiple message formats:
+
+| Format | Description |
+|--------|-------------|
+| OpenAI | Standard `role`, `content`, `tool_calls`, `tool_call_id` |
+| OLMo | Adds `functions` and `function_calls` fields |
+| DeepSeek | Tool call arguments are serialized to JSON strings, as DeepSeek templates expect |
+
+## Tool Support
+
+| Setting | Description |
+|---------|-------------|
+| `inject_tools_as_functions=true` | Injects tools into the system message as a `functions` key (OLMo-style) |
+| `inject_tools_as_functions=false` | Passes tools separately to `apply_chat_template` (standard transformers) |
+
+## Models
+
+The server uses HuggingFace's `transformers` library and supports any model
+with a chat template. Default: `Qwen/Qwen3-Coder-480B-A35B-Instruct`
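+
+For reference, the server is a thin wrapper around
+`AutoTokenizer.apply_chat_template`. A minimal sketch of the equivalent
+direct call (using the default model named above; the first run downloads
+the tokenizer, so network access is required):
+
+```python
+from transformers import AutoTokenizer
+
+# Load the tokenizer, which carries the model's Jinja chat template
+tokenizer = AutoTokenizer.from_pretrained(
+    "Qwen/Qwen3-Coder-480B-A35B-Instruct", trust_remote_code=True
+)
+
+# Render a conversation to the exact prompt string the model would see
+prompt = tokenizer.apply_chat_template(
+    [{"role": "user", "content": "Hello!"}],
+    tokenize=False,
+    add_generation_prompt=True,
+)
+print(prompt)
+```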
+
+## Dependencies
+
+The script uses PEP 723 inline dependency metadata. When run with `uv`,
+dependencies are automatically installed into an isolated environment:
+
+- `fastapi` - Web framework
+- `uvicorn` - ASGI server
+- `transformers` - HuggingFace tokenizers and chat templates
+- `jinja2` - Template engine used by chat templates
+- `mcp` - Model Context Protocol SDK
diff --git a/cmd/prompt-rendering/server.py b/cmd/prompt-rendering/server.py
new file mode 100644
index 000000000..e9420e22e
--- /dev/null
+++ b/cmd/prompt-rendering/server.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "fastapi",
+#     "uvicorn",
+#     "transformers",
+#     "jinja2",
+#     "mcp",
+# ]
+# ///
+"""
+HuggingFace Prompt Renderer MCP Server
+
+Model Context Protocol (MCP) server for rendering conversation messages into
+model-specific prompt strings using HuggingFace tokenizer chat templates.
+
+Usage:
+    # Run MCP server over stdio
+    uv run cmd/prompt-rendering/server.py --mcp
+
+    # Start FastAPI server for manual testing
+    uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000
+
+    # Test with curl
+    curl -X POST http://localhost:8000/generate-prompt \\
+        -H "Content-Type: application/json" \\
+        -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
+"""
+
+import argparse
+import copy
+import json
+import traceback
+from typing import Any, Dict, List, Optional
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from transformers import AutoTokenizer
+
+try:
+    from mcp.server.fastmcp import FastMCP
+except ImportError:
+    FastMCP = None
+
+# Cache for tokenizers to avoid reloading
+_tokenizer_cache: Dict[str, Any] = {}
+
+
+class Message(BaseModel):
+    role: str
+    content: Optional[str] = None
+    tool_calls: Optional[List[Dict[str, Any]]] = None
+    tool_call_id: Optional[str] = None
+    functions: Optional[str] = None  # For OLMo-style function passing
+    function_calls: Optional[str] = None  # For OLMo-style function call results
+
+
+class GeneratePromptRequest(BaseModel):
+    messages: List[Message]
+    # Default matches the MCP tool below and the README examples
+    model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct"
+    tools: Optional[List[Dict[str, Any]]] = None
+    # Whether to inject tools into the system message as a 'functions' key
+    # (for OLMo-style templates)
+    inject_tools_as_functions: Optional[bool] = True
+
+
+class GeneratePromptResponse(BaseModel):
+    prompt: str
+    model: str
+
+
+# FastAPI app
+app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")
+
+
+def get_tokenizer(model_name: str) -> Any:
+    """Get or create the tokenizer for the given model."""
+    if model_name not in _tokenizer_cache:
+        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
+            model_name, trust_remote_code=True
+        )
+    return _tokenizer_cache[model_name]
+
+
+def is_deepseek_model(model_name: str) -> bool:
+    """Check whether this is a DeepSeek model."""
+    return "deepseek" in model_name.lower()
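+
+
+# Illustrative example of the normalization performed below (not executed):
+#
+#   Input message (OpenAI-style, arguments as a dict):
+#     {"role": "assistant", "tool_calls": [{"id": "call_1", "type": "function",
+#       "function": {"name": "get_weather", "arguments": {"location": "SF"}}}]}
+#
+#   With inject_tools_as_functions=True the call is re-encoded OLMo-style:
+#     {"role": "assistant", "content": None,
+#      "function_calls": '[{"id": "call_1", ...}]'}
+#
+#   For DeepSeek models the arguments are serialized to a JSON string:
+#     {"role": "assistant", "content": None, "tool_calls": [{"type": "function",
+#       "function": {"name": "get_weather", "arguments": '{"location": "SF"}'}}]}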
+def normalize_messages(
+    raw_messages: List[Any],
+    tools: Optional[List[Dict[str, Any]]],
+    inject_tools_as_functions: bool,
+    model: str,
+) -> List[Dict[str, Any]]:
+    """Normalize messages for different chat template formats."""
+    messages: List[Dict[str, Any]] = []
+    tools_json = json.dumps(tools) if tools else None
+    is_deepseek = is_deepseek_model(model)
+
+    for msg in raw_messages:
+        message = msg if isinstance(msg, Message) else Message(**msg)
+        message_dict: Dict[str, Any] = {"role": message.role, "content": None}
+
+        if message.content is not None:
+            message_dict["content"] = message.content
+
+        # Handle an explicit functions field (OLMo-style)
+        if message.functions is not None:
+            message_dict["functions"] = message.functions
+        # Inject tools into the system message as 'functions' (for OLMo templates)
+        elif inject_tools_as_functions and message.role == "system" and tools_json:
+            message_dict["functions"] = tools_json
+
+        # Handle an explicit function_calls field (OLMo-style)
+        if message.function_calls is not None:
+            message_dict["function_calls"] = message.function_calls
+        # Convert tool_calls for templates
+        elif message.tool_calls is not None:
+            if is_deepseek:
+                # DeepSeek format: arguments must be a JSON string
+                tool_calls = []
+                for tool_call in message.tool_calls:
+                    tc = {
+                        "type": "function",
+                        "function": {
+                            "name": tool_call["function"]["name"],
+                            "arguments": json.dumps(tool_call["function"]["arguments"])
+                            if isinstance(tool_call["function"]["arguments"], dict)
+                            else tool_call["function"]["arguments"],
+                        },
+                    }
+                    tool_calls.append(tc)
+                message_dict["tool_calls"] = tool_calls
+            elif inject_tools_as_functions:
+                # Convert to the OLMo function_calls format
+                message_dict["function_calls"] = json.dumps(message.tool_calls)
+            else:
+                # Standard transformers format: parse JSON-string arguments into
+                # dicts. Deep-copy so the caller's nested dicts are not mutated.
+                tool_calls = []
+                for tool_call in message.tool_calls:
+                    tool_call_copy = copy.deepcopy(tool_call)
+                    if (
+                        "function" in tool_call_copy
+                        and "arguments" in tool_call_copy["function"]
+                    ):
+                        try:
+                            tool_call_copy["function"]["arguments"] = json.loads(
+                                tool_call_copy["function"]["arguments"]
+                            )
+                        except (json.JSONDecodeError, TypeError):
+                            pass  # Already a dict, or not valid JSON
+                    tool_calls.append(tool_call_copy)
+                message_dict["tool_calls"] = tool_calls
+
+        if message.tool_call_id is not None:
+            message_dict["tool_call_id"] = message.tool_call_id
+
+        messages.append(message_dict)
+
+    return messages
+
+
+def build_prompt(
+    raw_messages: List[Any],
+    model: str,
+    tools: Optional[List[Dict[str, Any]]],
+    inject_tools_as_functions: bool,
+) -> str:
+    """Build a prompt from messages using the model's chat template."""
+    messages = normalize_messages(
+        raw_messages=raw_messages,
+        tools=tools,
+        inject_tools_as_functions=inject_tools_as_functions,
+        model=model,
+    )
+
+    tokenizer = get_tokenizer(model)
+
+    # Pass tools to apply_chat_template only when they were not injected into
+    # the system message; OLMo-style templates already carry them in messages.
+    if tools and not inject_tools_as_functions:
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            tools=tools,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+    else:
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+
+    return prompt
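+
+
+# Minimal smoke test for build_prompt (illustrative; the first call downloads
+# the tokenizer, so network access is required):
+#
+#   prompt = build_prompt(
+#       raw_messages=[{"role": "user", "content": "Hello!"}],
+#       model="Qwen/Qwen3-Coder-480B-A35B-Instruct",
+#       tools=None,
+#       inject_tools_as_functions=True,
+#   )
+#   assert isinstance(prompt, str) and len(prompt) > 0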
+ """ + try: + prompt = build_prompt( + raw_messages=request.messages, + model=request.model, + tools=request.tools, + inject_tools_as_functions=request.inject_tools_as_functions, + ) + return GeneratePromptResponse(prompt=prompt, model=request.model) + + except Exception as e: + import traceback + + traceback.print_exc() + raise HTTPException( + status_code=500, + detail=f"Failed to generate prompt: {str(e)}", + ) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy"} + + +if FastMCP is not None: + mcp = FastMCP("huggingface-prompt-renderer") + + @mcp.tool() + def generate_prompt_tool( + messages: List[Dict[str, Any]], + model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct", + tools: Optional[List[Dict[str, Any]]] = None, + inject_tools_as_functions: bool = True, + ) -> Dict[str, str]: + """ + Render conversation messages into a model-specific prompt string using HuggingFace tokenizer chat templates. + + This tool takes a list of message objects and applies the target model's chat template to produce + the exact prompt string that would be fed to the model. It handles various message formats including + standard OpenAI-style, OLMo-style (functions/function_calls), and DeepSeek-specific formatting. + + Use this tool to: + - Verify that a model's chat template correctly formats your conversation + - Test edge cases: tool calling, tool responses, interleaved thinking and tool calls, multiple tools in single response + - Compare prompt output across different models to understand template differences + - Debug issues with message formatting that cause unexpected model behavior + + Message format supports: + - role: "user", "assistant", "system", "tool" + - content: string content of the message + - tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}}) + - tool_call_id: for tool role messages, references the call being responded to + - functions: optional field for OLMo-style tool definitions + - function_calls: optional field for OLMo-style tool call results + + Parameters: + - messages: List of message dictionaries forming the conversation + - model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct) + - tools: Optional list of tool/function definitions for function calling models + - inject_tools_as_functions: If True, injects tools into system message as 'functions' key (OLMo-style). If False, passes tools separately to apply_chat_template. + + Returns: Dictionary with 'prompt' (rendered string) and 'model' keys. + + Recommended test cases: + 1. Simple conversation: user -> assistant + 2. Tool calling: user -> assistant with tool_call -> tool response -> assistant + 3. Multiple tool calls in one assistant message + 4. Multiple tool responses interleaved with assistant reasoning + 5. Nested tool calls (assistant calls tool, uses result to call another) + 6. System message with tool definitions + 7. Empty or None content in messages + 8. 
+if FastMCP is not None:
+    mcp = FastMCP("huggingface-prompt-renderer")
+
+    @mcp.tool()
+    def generate_prompt_tool(
+        messages: List[Dict[str, Any]],
+        model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
+        tools: Optional[List[Dict[str, Any]]] = None,
+        inject_tools_as_functions: bool = True,
+    ) -> Dict[str, str]:
+        """
+        Render conversation messages into a model-specific prompt string using
+        HuggingFace tokenizer chat templates.
+
+        This tool takes a list of message objects and applies the target
+        model's chat template to produce the exact prompt string that would be
+        fed to the model. It handles several message formats: standard
+        OpenAI-style, OLMo-style (functions/function_calls), and
+        DeepSeek-specific formatting.
+
+        Use this tool to:
+        - Verify that a model's chat template correctly formats your conversation
+        - Test edge cases: tool calling, tool responses, interleaved thinking
+          and tool calls, multiple tools in a single response
+        - Compare prompt output across models to understand template differences
+        - Debug message-formatting issues that cause unexpected model behavior
+
+        Message format supports:
+        - role: "user", "assistant", "system", "tool"
+        - content: string content of the message
+        - tool_calls: list of tool call objects (OpenAI format:
+          {type, function: {name, arguments}})
+        - tool_call_id: for tool role messages, references the call being
+          responded to
+        - functions: optional field for OLMo-style tool definitions
+        - function_calls: optional field for OLMo-style tool call results
+
+        Parameters:
+        - messages: List of message dictionaries forming the conversation
+        - model: HuggingFace model identifier
+          (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
+        - tools: Optional list of tool/function definitions for function
+          calling models
+        - inject_tools_as_functions: If True, injects tools into the system
+          message as a 'functions' key (OLMo-style). If False, passes tools
+          separately to apply_chat_template.
+
+        Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.
+
+        Recommended test cases:
+        1. Simple conversation: user -> assistant
+        2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
+        3. Multiple tool calls in one assistant message
+        4. Multiple tool responses interleaved with assistant reasoning
+        5. Nested tool calls (assistant calls a tool, uses the result to call another)
+        6. System message with tool definitions
+        7. Empty or None content in messages
+        8. Very long messages, to check how long content is rendered
+        """
+        prompt = build_prompt(
+            raw_messages=messages,
+            model=model,
+            tools=tools,
+            inject_tools_as_functions=inject_tools_as_functions,
+        )
+        return {"prompt": prompt, "model": model}
+else:
+    mcp = None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="HuggingFace Prompt Renderer MCP Server",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--mcp", action="store_true", help="Run the MCP server over stdio"
+    )
+    parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
+    parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
+    args = parser.parse_args()
+
+    if args.mcp:
+        if mcp is None:
+            raise RuntimeError(
+                "MCP server requested but the mcp package is not installed."
+            )
+        mcp.run()
+    else:
+        uvicorn.run(app, host=args.host, port=args.port)
+
+
+if __name__ == "__main__":
+    main()