大模型API开发实战:从基础调用到生产级流式服务完整指南
本文面向 OpenAI、Claude、Gemini 三大主流大模型 API 的开发链路(示例代码以 OpenAI 和 Claude 为主),包含 Python 实战代码、错误处理、流式输出、Function Calling、多模型路由等生产级方案。
一、为什么你需要掌握大模型API开发?
2026年的AI应用开发早已不是"调个接口拿回复"那么简单。生产级应用需要面对:高并发下的Token成本控制、流式响应的用户体验、多模型故障转移、结构化输出校验、敏感内容过滤等一系列工程挑战。
本文将从最基础的HTTP调用开始,逐步构建一个支持多厂商、具备流式输出、自动重试、日志追踪的生产级大模型API客户端。所有代码可直接复制运行。
二、环境准备与基础调用
2.1 安装依赖
pip install openai anthropic google-generativeai httpx tenacity python-dotenv flask pydantic
2.2 统一配置管理
import os
from dataclasses import dataclass, field

from dotenv import load_dotenv
load_dotenv()
def _env(name: str, default: str = ""):
    """Return a zero-argument factory that reads *name* from the environment."""
    return lambda: os.getenv(name, default)


@dataclass
class LLMConfig:
    """Connection settings shared by the provider clients.

    String settings default to environment variables that are read when an
    instance is created (not when the class is defined), so values exported
    or loaded after import are still picked up.  API keys are excluded from
    ``repr()`` so that printing or logging a config cannot leak them.
    """

    # Provider API keys; an empty string means "not configured".
    openai_key: str = field(default_factory=_env("OPENAI_API_KEY"), repr=False)
    anthropic_key: str = field(default_factory=_env("ANTHROPIC_API_KEY"), repr=False)
    gemini_key: str = field(default_factory=_env("GEMINI_API_KEY"), repr=False)
    # Base URLs, overridable to point at a proxy or compatible gateway.
    openai_base: str = field(
        default_factory=_env("OPENAI_BASE_URL", "https://api.openai.com/v1")
    )
    anthropic_base: str = field(
        default_factory=_env("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
    )
    # Request timeout handed to the SDK calls (seconds).
    timeout: int = 60
    # Maximum number of retries for a failed request.
    max_retries: int = 3


config = LLMConfig()
2.3 OpenAI API 基础调用
from openai import OpenAI
# Pass the shared timeout and retry budget so this client honours LLMConfig
# instead of silently falling back to the SDK defaults.
client = OpenAI(
    api_key=config.openai_key,
    base_url=config.openai_base,
    timeout=config.timeout,
    max_retries=config.max_retries,
)
def chat_with_openai(prompt: str, model: str = "gpt-4o") -> str:
    """Send a single-turn prompt to the OpenAI chat endpoint and return the reply.

    Args:
        prompt: The user message.
        model: Chat model identifier.

    Returns:
        The ``content`` of the first choice in the completion.
    """
    system_turn = {"role": "system", "content": "你是一个专业的技术助手,回答要简洁准确。"}
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0.3,
        max_tokens=2048,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
if __name__ == "__main__":
    # Demo: issue one request and print the answer.
    print(chat_with_openai("解释一下Python中的asyncio事件循环"))
2.4 Claude API 基础调用
import anthropic
# Pass the shared timeout and retry budget so this client honours LLMConfig
# instead of silently falling back to the SDK defaults.
claude_client = anthropic.Anthropic(
    api_key=config.anthropic_key,
    base_url=config.anthropic_base,
    timeout=config.timeout,
    max_retries=config.max_retries,
)
def chat_with_claude(prompt: str, model: str = "claude-3-5-sonnet-20241022") -> str:
    """Send a single-turn prompt to the Claude Messages API and return the reply.

    Args:
        prompt: The user message.
        model: Claude model identifier.

    Returns:
        The ``text`` of the first content block in the response.
    """
    request = {
        "model": model,
        "max_tokens": 4096,
        "system": "你是一个专业的技术助手,回答要简洁准确。",
        "messages": [{"role": "user", "content": prompt}],
    }
    message = claude_client.messages.create(**request)
    first_block = message.content[0]
    return first_block.text
三、流式输出(Streaming)实战
流式输出是大模型API开发中最重要的用户体验优化手段。它能让用户立即看到第一个字,而不是等待整段生成完毕。
3.1 OpenAI 流式输出
def stream_chat_openai(prompt: str, model: str = "gpt-4o"):
    """Stream a completion to stdout token by token and return the full text.

    Args:
        prompt: The user message.
        model: Chat model identifier.

    Returns:
        The concatenation of every streamed content delta.
    """
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        temperature=0.3,
    )
    pieces = []
    for chunk in stream:
        # A chunk may carry an empty `choices` list (for example a trailing
        # usage-only chunk); skip it instead of failing on choices[0].
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        if token:
            pieces.append(token)
            print(token, end="", flush=True)
    print()
    return "".join(pieces)
3.2 Claude 流式输出
def stream_chat_claude(prompt: str, model: str = "claude-3-5-sonnet-20241022"):
    """Stream a Claude reply to stdout as it arrives and return the full text.

    Args:
        prompt: The user message.
        model: Claude model identifier.

    Returns:
        Every streamed text fragment joined in arrival order.
    """
    request = dict(
        model=model,
        max_tokens=4096,
        messages=[{"role": "user", "content": prompt}],
    )
    collected = []
    with claude_client.messages.stream(**request) as stream:
        for fragment in stream.text_stream:
            collected.append(fragment)
            print(fragment, end="", flush=True)
    print()
    return "".join(collected)
3.3 生产级流式封装(支持SSE推送)
from flask import Flask, Response, request
import json
app = Flask(__name__)
@app.route("/chat/stream", methods=["POST"])
def chat_stream():
    """Stream a chat completion to the caller as Server-Sent Events.

    Expects a JSON object body with a ``prompt`` string and an optional
    ``model`` string.  Each event is a single ``data: <json>`` line carrying
    ``{"token": ...}``; the stream ends with ``{"done": true}`` or, on
    failure, ``{"error": ...}``.

    Returns:
        A ``text/event-stream`` response, or a 400 JSON response when the
        body is not a JSON object or ``prompt`` is missing or blank.
    """
    # get_json(silent=True) yields None instead of raising on a missing or
    # malformed body, so bad requests get a clean 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    prompt = data.get("prompt", "")
    model = data.get("model", "gpt-4o")
    if not isinstance(prompt, str) or not prompt.strip():
        return Response(
            json.dumps({"error": "prompt is required"}, ensure_ascii=False),
            status=400,
            mimetype="application/json",
        )

    def sse(payload: dict) -> str:
        """Serialize *payload* as one SSE ``data:`` event."""
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    def generate():
        try:
            stream = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                stream=True,
            )
            for chunk in stream:
                # Chunks with an empty `choices` list carry no text.
                if not chunk.choices:
                    continue
                token = chunk.choices[0].delta.content
                if token:
                    yield sse({"token": token})
            yield sse({"done": True})
        except Exception as e:
            # The status line is already sent once streaming starts, so a
            # failure can only be reported as a final in-band event.
            yield sse({"error": str(e)})

    return Response(
        generate(),
        mimetype="text/event-stream",
        # Ask caches and buffering reverse proxies to pass events through
        # immediately instead of holding them back.
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
if __name__ == "__main__":
    # debug=True enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the port execute arbitrary Python. Keep it off here and
    # turn it on only on a local development machine.
    app.run(debug=False, port=5000)
前端调用示例:
// EventSource can only issue GET requests without a body, but /chat/stream
// accepts POST with a JSON payload, so the SSE stream is read through fetch().
async function streamChat(prompt, model = 'gpt-4o') {
  const response = await fetch('/chat/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt, model }),
  });
  if (!response.ok || !response.body) {
    throw new Error(`stream request failed: HTTP ${response.status}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  let buffer = '';

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    // { stream: true } keeps multi-byte characters split across chunks intact.
    buffer += decoder.decode(value, { stream: true });

    // Events are separated by a blank line; keep any trailing partial event
    // in the buffer until the rest of it arrives.
    const events = buffer.split('\n\n');
    buffer = events.pop();

    for (const event of events) {
      if (!event.startsWith('data: ')) continue;
      const data = JSON.parse(event.slice('data: '.length));
      if (data.error) {
        await reader.cancel();
        throw new Error(data.error);
      }
      if (data.done) {
        await reader.cancel();
        return;
      }
      appendToChat(data.token);
    }
  }
}
四、Function Calling:让大模型调用外部工具
Function Calling 是大模型API最革命性的能力之一,它让模型能够"使用工具"完成复杂任务。
4.1 定义工具函数
def _function_tool(name, description, properties, required):
    """Build one ``function`` tool entry with an object-typed parameter schema."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool catalogue passed to the chat completion call via `tools=`.
tools = [
    _function_tool(
        "get_weather",
        "获取指定城市的当前天气",
        {
            "city": {"type": "string", "description": "城市名称,如北京、上海"},
            "unit": {
                "type": "string",
                "enum": ["celsius", "fahrenheit"],
                "description": "温度单位",
            },
        },
        ["city"],
    ),
    _function_tool(
        "search_database",
        "在商品数据库中搜索商品",
        {
            "keyword": {"type": "string", "description": "搜索关键词"},
            "category": {"type": "string", "description": "商品分类"},
        },
        ["keyword"],
    ),
]
4.2 完整Function Calling流程
import json
def get_weather(city: str, unit: str = "celsius") -> str:
    """Return a one-line weather summary for *city* (mock data, no network).

    Args:
        city: City name; cities missing from the mock table get a
            25-degree / "未知" placeholder.
        unit: ``"celsius"`` (default) or ``"fahrenheit"``, matched
            case-insensitively on the leading ``f``.  Anything else,
            including an empty string, is reported in Celsius.

    Returns:
        Text of the form ``"<city>当前天气:<condition>,温度<temp>°<C|F>"``.
    """
    # Mock weather table; values are treated as degrees Celsius.
    weather_db = {
        "北京": {"temp": 28, "condition": "晴"},
        "上海": {"temp": 30, "condition": "多云"},
        "广州": {"temp": 33, "condition": "雷阵雨"},
    }
    info = weather_db.get(city, {"temp": 25, "condition": "未知"})
    temp = info["temp"]
    if (unit or "").strip().lower().startswith("f"):
        # Convert the value as well as the label, so 28 degrees Celsius is
        # reported as 82°F rather than a bogus "28°F".
        temp = round(temp * 9 / 5 + 32)
        symbol = "F"
    else:
        symbol = "C"
    return f"{city}当前天气:{info['condition']},温度{temp}°{symbol}"
def search_database(keyword: str, category: str = "") -> str:
    """Return mock search hits for *keyword* as a JSON array string.

    ``category`` is accepted but not used by this mock.
    """
    # Mock catalogue: two fixed entries derived from the keyword.
    hits = [
        {"name": f"{keyword}商品{suffix}", "price": price}
        for suffix, price in (("A", 99), ("B", 199))
    ]
    return json.dumps(hits)
def agent_chat(user_input: str) -> str:
    """Answer *user_input*, letting the model call local tools when it asks to.

    The first request offers the ``tools`` catalogue.  If the model requests
    tool calls, each one is executed locally and its result is sent back in
    a second request that produces the final answer.

    Args:
        user_input: The user's message.

    Returns:
        The model's final text reply ("" when the reply has no text content).
    """
    # Tool names offered to the model, mapped to their local implementations.
    handlers = {
        "get_weather": get_weather,
        "search_database": search_database,
    }
    messages = [{"role": "user", "content": user_input}]

    # Round 1: let the model decide whether a tool is needed.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        tools=tools,
        tool_choice="auto",
    )
    assistant_message = response.choices[0].message

    if not assistant_message.tool_calls:
        return assistant_message.content or ""

    messages.append({
        "role": "assistant",
        "content": assistant_message.content or "",
        "tool_calls": [tc.model_dump() for tc in assistant_message.tool_calls],
    })

    for tool_call in assistant_message.tool_calls:
        handler = handlers.get(tool_call.function.name)
        if handler is None:
            result = "未知工具"
        else:
            try:
                # Arguments are model-generated text: they may be invalid
                # JSON or name parameters the handler does not accept.
                function_args = json.loads(tool_call.function.arguments or "{}")
                result = handler(**function_args)
            except (json.JSONDecodeError, TypeError) as exc:
                # Report the failure to the model instead of crashing, so
                # every requested tool call still gets a tool message.
                result = f"工具调用失败: {exc}"
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": result,
        })

    # Round 2: hand the tool results back for the final answer.
    final_response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
    )
    return final_response.choices[0].message.content or ""
# Smoke test: one request that should trigger both tools.
print(agent_chat("帮我查一下广州今天的天气,顺便搜一下手机"))
五、多模型路由与故障转移
生产环境不能依赖单一模型厂商,需要实现自动路由和故障转移。
from tenacity import retry, stop_after_attempt, wait_exponential
import time
class MultiLLMRouter:
    """Route a prompt to a preferred provider and fail over to the others.

    Providers are tried in order: ``preferred`` first, then the remaining
    entries of ``fallback_order``.  When every provider fails, the whole pass
    is retried with exponential backoff; after the last attempt the
    ``RuntimeError`` is raised to the caller.
    """

    def __init__(self, config: LLMConfig):
        """Create one SDK client per provider from *config*."""
        self.config = config
        self.clients = {
            "openai": OpenAI(api_key=config.openai_key, base_url=config.openai_base),
            "claude": anthropic.Anthropic(api_key=config.anthropic_key, base_url=config.anthropic_base),
        }
        self.fallback_order = ["openai", "claude"]

    # reraise=True surfaces the RuntimeError raised below once the attempts
    # are exhausted; without it tenacity raises its own RetryError instead.
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        reraise=True,
    )
    def chat(self, prompt: str, preferred: str = "openai", stream: bool = False) -> str:
        """Return the reply from the first provider that succeeds.

        Args:
            prompt: The user message.
            preferred: Provider to try first; unknown names are skipped.
            stream: Use the provider's streaming API internally.  The full
                text is still returned as a single string.

        Raises:
            RuntimeError: If no provider produced a reply, chained to the
                last provider error when there was one.
        """
        handlers = {"openai": self._call_openai, "claude": self._call_claude}
        providers = [preferred] + [p for p in self.fallback_order if p != preferred]
        last_error = None
        for provider in providers:
            handler = handlers.get(provider)
            if handler is None:
                continue
            # perf_counter is monotonic, so the latency cannot be skewed by
            # wall-clock adjustments.
            start = time.perf_counter()
            try:
                result = handler(prompt, stream)
            except Exception as e:
                # Boundary catch: any provider failure triggers failover.
                print(f"[路由] {provider} 调用失败: {e}")
                last_error = e
                continue
            latency = time.perf_counter() - start
            print(f"[路由] 使用 {provider},延迟 {latency:.2f}s")
            return result
        raise RuntimeError("所有模型提供商均不可用") from last_error

    def _call_openai(self, prompt: str, stream: bool) -> str:
        """Call the OpenAI chat endpoint and return the complete reply text."""
        response = self.clients["openai"].chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            stream=stream,
            timeout=self.config.timeout,
        )
        if stream:
            # Chunks with an empty `choices` list carry no text; skip them
            # instead of failing on choices[0].
            return "".join(
                c.choices[0].delta.content or "" for c in response if c.choices
            )
        return response.choices[0].message.content

    def _call_claude(self, prompt: str, stream: bool) -> str:
        """Call the Claude Messages API and return the complete reply text."""
        # Same timeout as the OpenAI path, so failover is not delayed by a
        # slower default on this provider.
        request = dict(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
            timeout=self.config.timeout,
        )
        if stream:
            with self.clients["claude"].messages.stream(**request) as s:
                return "".join(s.text_stream)
        response = self.clients["claude"].messages.create(**request)
        return response.content[0].text
# Usage: build a router from the shared config and ask for OpenAI first.
router = MultiLLMRouter(config)
result = router.chat("用Python写一个快速排序", preferred="openai")
print(result)
六、结构化输出与数据校验
生产环境中,大模型的输出经常需要被下游系统消费,必须保证格式稳定。
6.1 OpenAI Structured Outputs
from pydantic import BaseModel
from typing import List
from typing import Literal


# One unit of work inside a generated project plan.
class TaskItem(BaseModel):
    task_name: str
    # Closed set: declared as a Literal so it is part of the schema and is
    # validated, rather than only being described in a comment.
    priority: Literal["high", "medium", "low"]
    estimated_hours: int
# Schema for a whole plan; generate_project_plan passes this class as
# response_format and returns the parsed instance.
class ProjectPlan(BaseModel):
    project_name: str
    tasks: List[TaskItem]
    # NOTE(review): presumably the sum of the tasks' estimated_hours; nothing
    # in this model enforces that - confirm before relying on it.
    total_hours: int
    risks: List[str]
def generate_project_plan(description: str) -> ProjectPlan:
    """Generate a structured project plan for *description*.

    Args:
        description: Free-text description of the project requirements.

    Returns:
        The plan parsed into a ``ProjectPlan`` instance.

    Raises:
        ValueError: If the response carries no parsed plan, for example
            because the model refused the request.
    """
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": "你是一个项目管理专家,根据需求生成结构化的项目计划。"},
            {"role": "user", "content": description},
        ],
        response_format=ProjectPlan,
    )
    message = response.choices[0].message
    if message.parsed is None:
        # Fail here with the reason rather than handing None to a caller
        # that was promised a ProjectPlan.
        reason = getattr(message, "refusal", None) or "no structured output returned"
        raise ValueError(f"model did not return a project plan: {reason}")
    return message.parsed
plan = generate_project_plan("开发一个电商订单管理系统,包含用户管理、商品管理、订单处理、支付对接四个模块")
# model_dump_json() already writes non-ASCII text unescaped, and pydantic
# releases without an ensure_ascii keyword reject it with TypeError.
print(plan.model_dump_json(indent=2))
6.2 手动JSON模式(兼容旧模型)
def generate_plan_json(description: str) -> dict:
    """Request a project plan in JSON mode and return the decoded object.

    Args:
        description: Free-text description of the project requirements.

    Returns:
        The first choice's message content decoded with ``json.loads``.
    """
    conversation = [
        {"role": "system", "content": "输出必须是合法的JSON,不要包含markdown代码块标记。"},
        {"role": "user", "content": f"请为以下需求生成项目计划JSON:{description}"},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_format={"type": "json_object"},
    )
    raw_json = completion.choices[0].message.content
    return json.loads(raw_json)
七、Token成本控制与用量监控
class TokenTracker:
    """Record per-call token usage and estimate its cost in USD."""

    # USD per 1,000 tokens, keyed by model family.  Dated snapshot ids such
    # as "claude-3-5-sonnet-20241022" are matched by prefix.
    COST_PER_1K = {
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "claude-3-5-sonnet": {"input": 0.003, "output": 0.015},
    }
    # Rates applied when no family matches the model id.
    DEFAULT_RATES = {"input": 0.005, "output": 0.015}

    def __init__(self):
        self.usage_log = []

    def _rates_for(self, model: str) -> dict:
        """Return the rates whose family key is the longest prefix of *model*."""
        matches = [family for family in self.COST_PER_1K if model.startswith(family)]
        if not matches:
            return self.DEFAULT_RATES
        return self.COST_PER_1K[max(matches, key=len)]

    def log_usage(self, provider: str, model: str, prompt_tokens: int, completion_tokens: int) -> float:
        """Append one usage record to ``usage_log`` and return its cost.

        Args:
            provider: Provider label stored with the record.
            model: Model id; an exact family name or a dated snapshot of it.
            prompt_tokens: Number of input tokens billed.
            completion_tokens: Number of output tokens billed.

        Returns:
            The unrounded cost in USD; the logged ``cost_usd`` is rounded to
            six decimal places.
        """
        rates = self._rates_for(model)
        cost = (prompt_tokens / 1000 * rates["input"] +
                completion_tokens / 1000 * rates["output"])
        self.usage_log.append({
            "provider": provider,
            "model": model,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "cost_usd": round(cost, 6),
            "timestamp": time.time(),
        })
        return cost
# Wiring the tracker into a call.  The request is spelled out in full: a
# literal `...` argument is rejected by create() with TypeError.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "你好"}],
)
tracker = TokenTracker()
cost = tracker.log_usage("openai", "gpt-4o", response.usage.prompt_tokens, response.usage.completion_tokens)
print(f"本次调用消耗: ${cost:.6f}")
八、常见问题 FAQ
Q1: 流式输出时如何处理网络中断?
A: 建议在前端实现自动重连机制,同时服务端记录已生成的Token,支持断点续传。SSE连接可设置retry字段。
Q2: Function Calling的延迟很高,怎么优化?
A: 1) 减少工具定义的描述长度;2) 合并多个小工具为一个大工具;3) 使用tool_choice="required"强制调用避免模型犹豫;4) 对确定性任务直接走代码逻辑,不走模型判断。
Q3: 如何防止Prompt注入攻击?
A: 1) 用户输入与系统指令分离;2) 对用户输入做长度和关键词过滤;3) 输出层增加内容安全审核API;4) 敏感操作(如删除、转账)必须人工确认。
Q4: 多模型路由时如何选择首选模型?
A: 可按任务类型路由:代码生成选Claude,结构化JSON选GPT-4o,简单问答选GPT-4o-mini。也可按实时成本和延迟动态调整。
Q5: API返回429(Rate Limit)怎么办?
A: 使用指数退避重试(tenacity库),并在服务端维护令牌桶限流器。不同厂商的限流策略不同,需要分别处理。
Q6: 如何确保模型输出100%符合JSON Schema?
A: OpenAI的response_format配合Pydantic模型是最可靠方案。对于不支持Structured Outputs的模型,可使用instructor库或后处理校验(生成失败时自动重试)。
Q7: 生产环境应该使用SDK还是直接调HTTP API?
A: 推荐使用官方SDK。SDK已内置重试、超时、流式处理、类型提示等机制,能显著减少开发工作量。只有在极端性能敏感或特殊协议场景下才考虑直接调HTTP。
九、总结
本文构建了一套完整的生产级大模型API开发体系:
- 基础调用:掌握OpenAI、Claude的认证与请求格式
- 流式输出:用SSE实现实时响应,提升用户体验
- Function Calling:让模型具备工具使用能力,扩展应用边界
- 多模型路由:按优先级顺序实现故障转移,保障服务稳定性
- 结构化输出:通过Pydantic模型确保输出格式稳定可靠
- 成本监控:精细化追踪Token消耗,控制运营成本
将这些模块组合起来,你就拥有了一个企业级的大模型API基础设施,能够支撑从智能客服到自动化Agent的各类AI应用。
本文首发于 1630.top,转载请注明出处。