#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "fastapi",
#     "uvicorn",
#     "transformers",
#     "jinja2",
#     "mcp",
# ]
# ///
"""
HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

Usage:
    # Run MCP server over stdio
    uv run cmd/prompt-rendering/server.py --mcp

    # Start FastAPI server for manual testing
    uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

    # Test with curl (the "model" field is required by the request schema)
    curl -X POST http://localhost:8000/generate-prompt \\
        -H "Content-Type: application/json" \\
        -d '{"model": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
"""
from typing import Any, Dict, List, Optional

import argparse
import json

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from transformers import AutoTokenizer

# MCP support is optional: without the mcp package, only the FastAPI endpoints
# are available and --mcp will fail at startup.
try:
    from mcp.server.fastmcp import FastMCP
except Exception:
    FastMCP = None

# Cache for tokenizers to avoid reloading
_tokenizer_cache: Dict[str, Any] = {}


class Message(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None
    tool_call_id: Optional[str] = None
    functions: Optional[str] = None  # For OLMo-style function passing
    function_calls: Optional[str] = None  # For OLMo-style function call results


class GeneratePromptRequest(BaseModel):
    messages: List[Message]
    model: str
    tools: Optional[List[Dict[str, Any]]] = None
    # Whether to inject tools into system message as 'functions' key (for OLMo-style templates)
    inject_tools_as_functions: Optional[bool] = True


class GeneratePromptResponse(BaseModel):
    prompt: str
    model: str


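# Illustrative request body for POST /generate-prompt (the tool definition and
# values below are made up; any OpenAI-style function schema works):
#
#   {
#     "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
#     "messages": [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "What's the weather in Paris?"}
#     ],
#     "tools": [{"type": "function", "function": {"name": "get_weather",
#                "parameters": {"type": "object",
#                               "properties": {"city": {"type": "string"}}}}}],
#     "inject_tools_as_functions": true
#   }

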
# FastAPI app
app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")


def get_tokenizer(model_name: str) -> Any:
    """Get or create tokenizer for the given model."""
    if model_name not in _tokenizer_cache:
        # trust_remote_code lets model repos ship custom tokenizer code;
        # only enable it for models you trust.
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
    return _tokenizer_cache[model_name]


def is_deepseek_model(model_name: str) -> bool:
    """Check if this is a DeepSeek model."""
    return "deepseek" in model_name.lower()


def normalize_messages(
    raw_messages: List[Any],
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
    model: str,
) -> List[Dict[str, Any]]:
    """Normalize messages for different chat template formats."""
    messages: List[Dict[str, Any]] = []
    tools_json = json.dumps(tools) if tools else None
    is_deepseek = is_deepseek_model(model)

    for msg in raw_messages:
        message = msg if isinstance(msg, Message) else Message(**msg)
        message_dict: Dict[str, Any] = {"role": message.role, "content": None}

        if message.content is not None:
            message_dict["content"] = message.content

        # Handle explicit functions field (OLMo-style)
        if message.functions is not None:
            message_dict["functions"] = message.functions
        # Inject tools into system message as 'functions' (for OLMo templates)
        elif inject_tools_as_functions and message.role == "system" and tools_json:
            message_dict["functions"] = tools_json

        # Handle explicit function_calls field (OLMo-style)
        if message.function_calls is not None:
            message_dict["function_calls"] = message.function_calls
        # Convert tool_calls for templates
        elif message.tool_calls is not None:
            if is_deepseek:
                # DeepSeek format: arguments must be a JSON string
                tool_calls = []
                for tool_call in message.tool_calls:
                    tc = {
                        "type": "function",
                        "function": {
                            "name": tool_call["function"]["name"],
                            "arguments": json.dumps(tool_call["function"]["arguments"])
                            if isinstance(tool_call["function"]["arguments"], dict)
                            else tool_call["function"]["arguments"],
                        },
                    }
                    tool_calls.append(tc)
                message_dict["tool_calls"] = tool_calls
            elif inject_tools_as_functions:
                # Convert to OLMo function_calls format
                message_dict["function_calls"] = json.dumps(message.tool_calls)
            else:
                # Standard transformers format
                tool_calls = []
                for tool_call in message.tool_calls:
                    tool_call_copy = tool_call.copy()
                    if (
                        "function" in tool_call_copy
                        and "arguments" in tool_call_copy["function"]
                    ):
                        try:
                            tool_call_copy["function"]["arguments"] = json.loads(
                                tool_call_copy["function"]["arguments"]
                            )
                        except (json.JSONDecodeError, TypeError):
                            pass
                    tool_calls.append(tool_call_copy)
                message_dict["tool_calls"] = tool_calls

        if message.tool_call_id is not None:
            message_dict["tool_call_id"] = message.tool_call_id

        messages.append(message_dict)

    return messages


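# Normalization sketch (illustrative; the tool name and arguments are made up).
# An OpenAI-style assistant tool call such as
#   {"role": "assistant", "tool_calls": [{"type": "function",
#     "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]}
# comes out of normalize_messages as:
#   - DeepSeek models: kept under "tool_calls", with "arguments" forced to a JSON string
#   - inject_tools_as_functions=True (OLMo-style): the whole tool_calls list serialized
#     into a "function_calls" JSON string on the message
#   - otherwise: kept under "tool_calls", with "arguments" parsed back into a dict
#     when possible

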
def build_prompt(
    raw_messages: List[Any],
    model: str,
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
) -> str:
    """Build prompt from messages using the model's chat template."""
    messages = normalize_messages(
        raw_messages=raw_messages,
        tools=tools,
        inject_tools_as_functions=inject_tools_as_functions,
        model=model,
    )

    tokenizer = get_tokenizer(model)

    # For OLMo-style templates, don't pass tools separately (they're in messages)
    if tools and not inject_tools_as_functions:
        prompt = tokenizer.apply_chat_template(
            messages,
            tools=tools,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    return prompt


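# Example call (illustrative; the tool, arguments, and tool_call_id are made up):
#
#   build_prompt(
#       raw_messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "What's the weather in Paris?"},
#           {"role": "assistant", "tool_calls": [{"type": "function", "function": {
#               "name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]},
#           {"role": "tool", "tool_call_id": "call_0", "content": "18C and sunny"},
#       ],
#       model="Qwen/Qwen3-Coder-480B-A35B-Instruct",
#       tools=[{"type": "function", "function": {"name": "get_weather",
#               "parameters": {"type": "object",
#                              "properties": {"city": {"type": "string"}}}}}],
#       inject_tools_as_functions=False,
#   )

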
@app.post("/generate-prompt", response_model=GeneratePromptResponse)
async def generate_prompt(request: GeneratePromptRequest):
    """
    Generate a prompt from messages using the specified model's chat template.
    Optionally includes tool definitions if provided.
    """
    try:
        prompt = build_prompt(
            raw_messages=request.messages,
            model=request.model,
            tools=request.tools,
            inject_tools_as_functions=request.inject_tools_as_functions,
        )
        return GeneratePromptResponse(prompt=prompt, model=request.model)

    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate prompt: {str(e)}",
        )


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}


if FastMCP is not None:
    mcp = FastMCP("huggingface-prompt-renderer")

    @mcp.tool()
    def generate_prompt_tool(
        messages: List[Dict[str, Any]],
        model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        tools: Optional[List[Dict[str, Any]]] = None,
        inject_tools_as_functions: bool = True,
    ) -> Dict[str, str]:
        """
        Render conversation messages into a model-specific prompt string using
        HuggingFace tokenizer chat templates.

        This tool takes a list of message objects and applies the target model's
        chat template to produce the exact prompt string that would be fed to the
        model. It handles various message formats including standard OpenAI-style,
        OLMo-style (functions/function_calls), and DeepSeek-specific formatting.

        Use this tool to:
        - Verify that a model's chat template correctly formats your conversation
        - Test edge cases: tool calling, tool responses, interleaved thinking and
          tool calls, multiple tools in a single response
        - Compare prompt output across different models to understand template differences
        - Debug issues with message formatting that cause unexpected model behavior

        Message format supports:
        - role: "user", "assistant", "system", "tool"
        - content: string content of the message
        - tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}})
        - tool_call_id: for tool role messages, references the call being responded to
        - functions: optional field for OLMo-style tool definitions
        - function_calls: optional field for OLMo-style tool call results

        Parameters:
        - messages: List of message dictionaries forming the conversation
        - model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
        - tools: Optional list of tool/function definitions for function calling models
        - inject_tools_as_functions: If True, injects tools into the system message as a
          'functions' key (OLMo-style). If False, passes tools separately to
          apply_chat_template.

        Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.

        Recommended test cases:
        1. Simple conversation: user -> assistant
        2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
        3. Multiple tool calls in one assistant message
        4. Multiple tool responses interleaved with assistant reasoning
        5. Nested tool calls (assistant calls tool, uses result to call another)
        6. System message with tool definitions
        7. Empty or None content in messages
        8. Very long messages to test truncation handling
        """
        prompt = build_prompt(
            raw_messages=messages,
            model=model,
            tools=tools,
            inject_tools_as_functions=inject_tools_as_functions,
        )
        return {"prompt": prompt, "model": model}
else:
    mcp = None


def main():
    parser = argparse.ArgumentParser(
        description="HuggingFace Prompt Renderer MCP Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--mcp", action="store_true", help="Run MCP server over stdio"
    )
    parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
    parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
    args = parser.parse_args()

    if args.mcp:
        if mcp is None:
            raise RuntimeError("MCP server requested but mcp is not installed.")
        mcp.run()
    else:
        uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
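# Example MCP client registration (illustrative; the exact config shape depends on
# your MCP client, so check its documentation). Claude Desktop-style clients use
# something like:
#
#   {
#     "mcpServers": {
#       "huggingface-prompt-renderer": {
#         "command": "uv",
#         "args": ["run", "cmd/prompt-rendering/server.py", "--mcp"]
#       }
#     }
#   }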