#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "fastapi",
#     "uvicorn",
#     "transformers",
#     "jinja2",
#     "mcp",
# ]
# ///
"""
HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

Usage:
    # Run MCP server over stdio
    uv run cmd/prompt-rendering/server.py --mcp

    # Start FastAPI server for manual testing
    uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

    # Test with curl (the "model" field is required by the request schema)
    curl -X POST http://localhost:8000/generate-prompt \\
        -H "Content-Type: application/json" \\
        -d '{"model": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "messages": [{"role": "user", "content": "Hello!"}]}'
"""
from typing import Any, Dict, List, Optional

import argparse
import json

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from transformers import AutoTokenizer

# MCP support is optional: without the mcp package, only the FastAPI endpoints
# are available and --mcp will fail at startup.
try:
    from mcp.server.fastmcp import FastMCP
except Exception:
    FastMCP = None

# Cache for tokenizers to avoid reloading
_tokenizer_cache: Dict[str, Any] = {}


class Message(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None
    tool_call_id: Optional[str] = None
    functions: Optional[str] = None  # For OLMo-style function passing
    function_calls: Optional[str] = None  # For OLMo-style function call results


class GeneratePromptRequest(BaseModel):
    messages: List[Message]
    model: str
    tools: Optional[List[Dict[str, Any]]] = None
    # Whether to inject tools into system message as 'functions' key (for OLMo-style templates)
    inject_tools_as_functions: Optional[bool] = True


class GeneratePromptResponse(BaseModel):
    prompt: str
    model: str


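# Illustrative request body for POST /generate-prompt (the tool definition and
# values below are made up; any OpenAI-style function schema works):
#
#   {
#     "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
#     "messages": [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "What's the weather in Paris?"}
#     ],
#     "tools": [{"type": "function", "function": {"name": "get_weather",
#                "parameters": {"type": "object",
#                               "properties": {"city": {"type": "string"}}}}}],
#     "inject_tools_as_functions": true
#   }

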
# FastAPI app
app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")


def get_tokenizer(model_name: str) -> Any:
    """Get or create tokenizer for the given model."""
    if model_name not in _tokenizer_cache:
        # trust_remote_code lets model repos ship custom tokenizer code;
        # only enable it for models you trust.
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
    return _tokenizer_cache[model_name]


def is_deepseek_model(model_name: str) -> bool:
    """Check if this is a DeepSeek model."""
    return "deepseek" in model_name.lower()


def normalize_messages(
    raw_messages: List[Any],
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
    model: str,
) -> List[Dict[str, Any]]:
    """Normalize messages for different chat template formats."""
    messages: List[Dict[str, Any]] = []
    tools_json = json.dumps(tools) if tools else None
    is_deepseek = is_deepseek_model(model)

    for msg in raw_messages:
        message = msg if isinstance(msg, Message) else Message(**msg)
        message_dict: Dict[str, Any] = {"role": message.role, "content": None}

        if message.content is not None:
            message_dict["content"] = message.content

        # Handle explicit functions field (OLMo-style)
        if message.functions is not None:
            message_dict["functions"] = message.functions
        # Inject tools into system message as 'functions' (for OLMo templates)
        elif inject_tools_as_functions and message.role == "system" and tools_json:
            message_dict["functions"] = tools_json

        # Handle explicit function_calls field (OLMo-style)
        if message.function_calls is not None:
            message_dict["function_calls"] = message.function_calls
        # Convert tool_calls for templates
        elif message.tool_calls is not None:
            if is_deepseek:
                # DeepSeek format: arguments must be a JSON string
                tool_calls = []
                for tool_call in message.tool_calls:
                    tc = {
                        "type": "function",
                        "function": {
                            "name": tool_call["function"]["name"],
                            "arguments": json.dumps(tool_call["function"]["arguments"])
                            if isinstance(tool_call["function"]["arguments"], dict)
                            else tool_call["function"]["arguments"],
                        },
                    }
                    tool_calls.append(tc)
                message_dict["tool_calls"] = tool_calls
            elif inject_tools_as_functions:
                # Convert to OLMo function_calls format
                message_dict["function_calls"] = json.dumps(message.tool_calls)
            else:
                # Standard transformers format
                tool_calls = []
                for tool_call in message.tool_calls:
                    tool_call_copy = tool_call.copy()
                    if (
                        "function" in tool_call_copy
                        and "arguments" in tool_call_copy["function"]
                    ):
                        try:
                            tool_call_copy["function"]["arguments"] = json.loads(
                                tool_call_copy["function"]["arguments"]
                            )
                        except (json.JSONDecodeError, TypeError):
                            pass
                    tool_calls.append(tool_call_copy)
                message_dict["tool_calls"] = tool_calls

        if message.tool_call_id is not None:
            message_dict["tool_call_id"] = message.tool_call_id

        messages.append(message_dict)

    return messages


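# Normalization sketch (illustrative; the tool name and arguments are made up).
# An OpenAI-style assistant tool call such as
#   {"role": "assistant", "tool_calls": [{"type": "function",
#     "function": {"name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]}
# comes out of normalize_messages as:
#   - DeepSeek models: kept under "tool_calls", with "arguments" forced to a JSON string
#   - inject_tools_as_functions=True (OLMo-style): the whole tool_calls list serialized
#     into a "function_calls" JSON string on the message
#   - otherwise: kept under "tool_calls", with "arguments" parsed back into a dict
#     when possible

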
def build_prompt(
    raw_messages: List[Any],
    model: str,
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
) -> str:
    """Build prompt from messages using the model's chat template."""
    messages = normalize_messages(
        raw_messages=raw_messages,
        tools=tools,
        inject_tools_as_functions=inject_tools_as_functions,
        model=model,
    )

    tokenizer = get_tokenizer(model)

    # For OLMo-style templates, don't pass tools separately (they're in messages)
    if tools and not inject_tools_as_functions:
        prompt = tokenizer.apply_chat_template(
            messages,
            tools=tools,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    return prompt


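# Example call (illustrative; the tool, arguments, and tool_call_id are made up):
#
#   build_prompt(
#       raw_messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "What's the weather in Paris?"},
#           {"role": "assistant", "tool_calls": [{"type": "function", "function": {
#               "name": "get_weather", "arguments": "{\"city\": \"Paris\"}"}}]},
#           {"role": "tool", "tool_call_id": "call_0", "content": "18C and sunny"},
#       ],
#       model="Qwen/Qwen3-Coder-480B-A35B-Instruct",
#       tools=[{"type": "function", "function": {"name": "get_weather",
#               "parameters": {"type": "object",
#                              "properties": {"city": {"type": "string"}}}}}],
#       inject_tools_as_functions=False,
#   )

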
@app.post("/generate-prompt", response_model=GeneratePromptResponse)
async def generate_prompt(request: GeneratePromptRequest):
    """
    Generate a prompt from messages using the specified model's chat template.
    Optionally includes tool definitions if provided.
    """
    try:
        prompt = build_prompt(
            raw_messages=request.messages,
            model=request.model,
            tools=request.tools,
            inject_tools_as_functions=request.inject_tools_as_functions,
        )
        return GeneratePromptResponse(prompt=prompt, model=request.model)

    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate prompt: {str(e)}",
        )


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}


if FastMCP is not None:
    mcp = FastMCP("huggingface-prompt-renderer")

    @mcp.tool()
    def generate_prompt_tool(
        messages: List[Dict[str, Any]],
        model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        tools: Optional[List[Dict[str, Any]]] = None,
        inject_tools_as_functions: bool = True,
    ) -> Dict[str, str]:
        """
        Render conversation messages into a model-specific prompt string using
        HuggingFace tokenizer chat templates.

        This tool takes a list of message objects and applies the target model's
        chat template to produce the exact prompt string that would be fed to the
        model. It handles various message formats including standard OpenAI-style,
        OLMo-style (functions/function_calls), and DeepSeek-specific formatting.

        Use this tool to:
        - Verify that a model's chat template correctly formats your conversation
        - Test edge cases: tool calling, tool responses, interleaved thinking and
          tool calls, multiple tools in a single response
        - Compare prompt output across different models to understand template differences
        - Debug issues with message formatting that cause unexpected model behavior

        Message format supports:
        - role: "user", "assistant", "system", "tool"
        - content: string content of the message
        - tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}})
        - tool_call_id: for tool role messages, references the call being responded to
        - functions: optional field for OLMo-style tool definitions
        - function_calls: optional field for OLMo-style tool call results

        Parameters:
        - messages: List of message dictionaries forming the conversation
        - model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
        - tools: Optional list of tool/function definitions for function calling models
        - inject_tools_as_functions: If True, injects tools into the system message as a
          'functions' key (OLMo-style). If False, passes tools separately to
          apply_chat_template.

        Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.

        Recommended test cases:
        1. Simple conversation: user -> assistant
        2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
        3. Multiple tool calls in one assistant message
        4. Multiple tool responses interleaved with assistant reasoning
        5. Nested tool calls (assistant calls tool, uses result to call another)
        6. System message with tool definitions
        7. Empty or None content in messages
        8. Very long messages to test truncation handling
        """
        prompt = build_prompt(
            raw_messages=messages,
            model=model,
            tools=tools,
            inject_tools_as_functions=inject_tools_as_functions,
        )
        return {"prompt": prompt, "model": model}
else:
    mcp = None


def main():
    parser = argparse.ArgumentParser(
        description="HuggingFace Prompt Renderer MCP Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--mcp", action="store_true", help="Run MCP server over stdio"
    )
    parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
    parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
    args = parser.parse_args()

    if args.mcp:
        if mcp is None:
            raise RuntimeError("MCP server requested but mcp is not installed.")
        mcp.run()
    else:
        uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
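# Example MCP client registration (illustrative; the exact config shape depends on
# your MCP client, so check its documentation). Claude Desktop-style clients use
# something like:
#
#   {
#     "mcpServers": {
#       "huggingface-prompt-renderer": {
#         "command": "uv",
#         "args": ["run", "cmd/prompt-rendering/server.py", "--mcp"]
#       }
#     }
#   }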