手撸AI对话助手:带思考过程的完整实现指南
此前那篇《用 LangChain 驱动本地 Ollama 模型》介绍了如何借助 LangChain 实现对话,但大模型响应延迟始终是痛点——用户在等待中容易失去耐心。优化体验的直效方案就是流式输出。
流式输出:逐词响应提升交互感
效果类似打字机逐字打印,按 token 顺序逐步呈现内容。用户在内容逐步生成的过程中能获得即时反馈,显著降低等待焦虑。
安装依赖
pip install -U uvicorn "fastapi[standard]" "langchain[openai]"
调用流式输出核心方法
关键在于 stream 与 astream 两个方法。以下示例基于 FastAPI 构建接口:后端收到请求后,通过 llm.astream 逐个推送 token,前端实时渲染。代码中已配置跨域支持,便于本地开发调试。
import json
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from langchain_openai import ChatOpenAI
app = FastAPI()

# CORS for local development: accept any origin, method and header.
# Credentials stay disabled because the FastAPI CORS documentation states
# that wildcard origins/methods/headers cannot be combined with
# allow_credentials=True. List explicit origins before enabling cookies.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.post("/api/bot/chat")
async def bot_chat(request: dict):
    """Stream the model's reply to ``request["query"]`` as NDJSON.

    The JSON body is read as a plain dict; ``query`` falls back to "你好"
    when the key is absent. Each non-empty chunk yielded by ``llm.astream``
    is written as one line ``{"type": "chunk", "content": ...}``, and a final
    ``{"type": "done"}`` line marks the end of the stream.
    """
    query = request.get("query", "你好")
    llm = ChatOpenAI(
        model="qwen3.5:35b",
        base_url="http://192.168.31.24:4000",
        api_key="your api key",
        temperature=0.7,
        streaming=True,
    )
    system_prompt = (
        "你是一个会展示思考过程的AI。"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    async def generate():
        # Stream straight from the LLM, one chunk at a time.
        async for chunk in llm.astream(messages):
            content = chunk.content or ""
            if not content:
                continue
            # NDJSON: every record must end with a real newline so the
            # client can split the byte stream back into JSON documents.
            yield json.dumps({
                "type": "chunk",
                "content": content,
            }, ensure_ascii=False) + "\n"
        yield json.dumps({"type": "done"}) + "\n"

    return StreamingResponse(
        generate(),
        media_type="application/x-ndjson",
        headers={
            "Cache-Control": "no-cache",
            # Ask reverse proxies such as nginx not to buffer the stream.
            "X-Accel-Buffering": "no",
        },
    )
# Start the development server only when this file is run as a script.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
验证测试
使用 curl 发送请求观察效果:
# --no-buffer prints each NDJSON line as soon as it arrives.
curl --no-buffer --location --request POST 'http://127.0.0.1:8000/api/bot/chat' \
  --header 'Accept: application/json' \
  --header 'Content-Type: application/json' \
  --data-raw '{
"query": "请一步步思考:2+3等于多少?"
}'
思考过程可视化:让推理链路透明
仅流式输出仍不够,用户还希望看到模型的推理步骤。部分模型内置 reasoning 参数,但兼容性有限。更稳妥的做法是通过提示词让模型自行标注输出内容,前端根据标签区分渲染。
优化提示词策略
调整系统提示词为:
#... 其它不变
# The tag names must match what the front end parses: <think> for the
# reasoning section and <answer> for the final answer.
system_prompt = (
    "你是一个会展示思考过程的AI。\n"
    "请先输出你的思考过程(用<think>标签包裹),"
    "然后再输出最终答案(用<answer>标签包裹)。\n\n"
    "示例:\n"
    "<think>这里是推理过程</think>\n"
    "<answer>这里是最终答案</answer>"
)
#... 其它不变
无论底层模型是否原生支持推理,只要遵循标签格式,前端即可统一处理。
前端页面实现
以下完整前端示例通过 fetch 流式读取后端 NDJSON 数据,边接收边渲染。思考过程区域在流式输出时展开显示,收到 `</think>` 后自动折叠,用户可点击重新展开;最终答案持续显示。代码中解决了流式场景下标签被 token 截断的典型问题——当收到残缺标签(如 `</think>` 只到达了前半段 `</thi`)时,利用 stripPartial 函数剥离末尾残留的标签片段,确保显示内容干净。这是流式渲染必须处理的边界细节。
MiMo Chat
<script>
// True while a request is in flight; prevents overlapping sends.
let isSending = false;

/**
 * Send the text in #queryInput to the chat endpoint and stream the reply
 * into a freshly created message element.
 *
 * The response body is read as NDJSON: one JSON document per line.
 * `chunk` records append their `content` to `state.raw` and trigger a
 * render; a `done` record renders once more and removes the cursor.
 * Any failure is shown in the message's `.answer-content` element.
 *
 * @returns {Promise<void>}
 */
async function sendMessage() {
    if (isSending) return;
    const query = document.getElementById('queryInput').value.trim();
    if (!query) return;
    isSending = true;
    document.getElementById('sendBtn').disabled = true;
    const msgId = crypto.randomUUID().replace(/-/g, '');
    const conversationId = localStorage.getItem('convId') || crypto.randomUUID().replace(/-/g, '');
    localStorage.setItem('convId', conversationId);
    const msgEl = createMessage(msgId);
    document.getElementById('messages').prepend(msgEl);
    const state = {
        raw: '',
        lastThinkLen: 0,
        lastAnswerLen: 0,
        thinkCollapsed: false,
    };

    // Apply one NDJSON line to the message; blank or malformed lines are skipped.
    const handleLine = (line) => {
        if (!line.trim()) return;
        let data;
        try {
            data = JSON.parse(line);
        } catch (err) {
            console.warn('Skipping malformed NDJSON line:', line, err);
            return;
        }
        if (data.type === 'chunk') {
            state.raw += data.content;
            render(msgEl, state);
        } else if (data.type === 'done') {
            render(msgEl, state);
            removeCursor(msgEl);
        }
    };

    try {
        const resp = await fetch('http://127.0.0.1:8000/api/bot/chat', {
            method: 'POST',
            headers: {'Content-Type': 'application/json'},
            body: JSON.stringify({
                msgId, conversationId, query,
                isEditedQuery: false,
                modelConfig: {enableThinking: true, webSearchStatus: "disabled", model: ""},
                multiMedias: [],
            }),
        });
        // fetch only rejects on network failure; surface HTTP errors explicitly.
        if (!resp.ok) {
            throw new Error(`HTTP ${resp.status}`);
        }
        const reader = resp.body.getReader();
        const decoder = new TextDecoder();
        let buffer = '';
        while (true) {
            const {done, value} = await reader.read();
            if (done) break;
            buffer += decoder.decode(value, {stream: true});
            // Records are newline-delimited; the last piece may be an
            // incomplete line, so keep it for the next read.
            const lines = buffer.split('\n');
            buffer = lines.pop();
            for (const line of lines) {
                handleLine(line);
            }
        }
        // Flush bytes the streaming decoder is still holding, then process
        // a final line that arrived without a trailing newline.
        buffer += decoder.decode();
        handleLine(buffer);
        render(msgEl, state);
        removeCursor(msgEl);
    } catch (err) {
        msgEl.querySelector('.answer-content').textContent = '错误: ' + err.message;
    } finally {
        isSending = false;
        document.getElementById('sendBtn').disabled = false;
    }
}
/**
 * Re-render a message from the raw text streamed so far.
 *
 * The model is prompted to wrap its reasoning in <think>...</think> and its
 * final answer in <answer>...</answer>. Each regex captures everything after
 * the opening tag up to the closing tag, or up to the end of the string
 * while the closing tag has not arrived yet.
 *
 * Because a closing tag can be cut in half by a chunk boundary (for example
 * the text currently ends with "</thi"), the end-of-string fallback may
 * capture a fragment of that tag. While the closing tag is still missing,
 * the captured text is therefore passed through stripPartial (expected to
 * be defined elsewhere in this file) to drop the trailing fragment. Once
 * the full closing tag is present the regex match is exact and nothing
 * needs stripping.
 *
 * Only the text beyond state.lastThinkLen / state.lastAnswerLen is appended
 * to the DOM, so repeated calls with the same raw text are no-ops.
 *
 * @param {Element} msgEl - message element containing .think-section,
 *   .think-content and .answer-content.
 * @param {{raw: string, lastThinkLen: number, lastAnswerLen: number,
 *   thinkCollapsed: boolean}} state - per-message streaming state; updated
 *   in place.
 */
function render(msgEl, state) {
    const raw = state.raw;
    const thinkClosed = raw.includes('</think>');
    const answerClosed = raw.includes('</answer>');
    const thinkMatch = raw.match(/<think>([\s\S]*?)(?:<\/think>|$)/);
    const answerMatch = raw.match(/<answer>([\s\S]*?)(?:<\/answer>|$)/);
    let thinkText = thinkMatch ? thinkMatch[1] : '';
    let answerText = answerMatch ? answerMatch[1] : '';
    // While a closing tag is incomplete, drop its trailing fragment.
    if (!thinkClosed) {
        thinkText = stripPartial(thinkText, '</think>');
    }
    if (!answerClosed) {
        answerText = stripPartial(answerText, '</answer>');
    }
    // Think section
    if (thinkText.length > 0) {
        const section = msgEl.querySelector('.think-section');
        const content = msgEl.querySelector('.think-content');
        section.classList.add('visible');
        if (thinkText.length > state.lastThinkLen) {
            const delta = thinkText.substring(state.lastThinkLen);
            content.appendChild(document.createTextNode(delta));
            content.scrollTop = content.scrollHeight;
            state.lastThinkLen = thinkText.length;
        }
        // Collapse the reasoning once, as soon as it is complete.
        if (thinkClosed && !state.thinkCollapsed) {
            state.thinkCollapsed = true;
            const dots = section.querySelector('.loading-dots');
            if (dots) dots.style.display = 'none';
            section.querySelector('.think-arrow').classList.add('collapsed');
            content.classList.add('collapsed');
        }
    }
    // Answer section
    if (answerText.length > 0) {
        const content = msgEl.querySelector('.answer-content');
        if (answerText.length > state.lastAnswerLen) {
            const delta = answerText.substring(state.lastAnswerLen);
            // Keep exactly one cursor, always after the newest text.
            const oldCursor = content.querySelector('.cursor');
            if (oldCursor) oldCursor.remove();
            content.appendChild(document.createTextNode(delta));
            const cursor = document.createElement('span');
            cursor.className = 'cursor';
            content.appendChild(cursor);
            state.lastAnswerLen = answerText.length;
        }
    }
}
/**
* 从文本末尾剥掉部分标签
* 比如 stripPartial("2+3等于5
AI 助手

