Streaming LLM Responses: From Tokens to UI
End-to-end patterns for implementing real-time streaming from language models to user interfaces with proper error handling.
Users expect real-time feedback. Here's how to stream tokens from model to browser.
Server-Side Streaming
```typescript
import OpenAI from 'openai';

const openai = new OpenAI();

// Async generator that yields content tokens as the model emits them.
async function* streamCompletion(prompt: string) {
  const stream = await openai.chat.completions.create({
    model: 'gpt-4',
    messages: [{ role: 'user', content: prompt }],
    stream: true,
  });

  for await (const chunk of stream) {
    // Each chunk carries a small delta; role-only chunks have no content.
    const content = chunk.choices[0]?.delta?.content;
    if (content) yield content;
  }
}
```
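The generator still needs an HTTP route to reach the browser. The article doesn't pin down a server framework, so here is a minimal sketch assuming Express; the route name `/api/completion` is illustrative. The try/catch matters because once streaming begins the status code has already been sent, so an error can only end the stream early.

```typescript
import express from 'express';

const app = express();
app.use(express.json());

// Hypothetical route: pipes the generator into a chunked HTTP response.
app.post('/api/completion', async (req, res) => {
  res.setHeader('Content-Type', 'text/plain; charset=utf-8');

  try {
    for await (const token of streamCompletion(req.body.prompt)) {
      res.write(token);
    }
  } catch (err) {
    // Headers are already sent; we can't change the status code,
    // so log and end the stream, letting the client see a truncated body.
    console.error('stream failed:', err);
  } finally {
    res.end();
  }
});

app.listen(3000);
```

Server-Sent Events would work too (set `Content-Type: text/event-stream` and frame each token as a `data:` line), but plain chunked text keeps the client-side reader below simple.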
Client-Side Consumption

```typescript
import { useState } from 'react';

function useStreamingResponse(endpoint: string) {
  const [content, setContent] = useState('');

  const stream = async (prompt: string) => {
    setContent(''); // reset before each new request
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt }),
    });
    if (!response.ok || !response.body) {
      throw new Error(`Request failed: ${response.status}`);
    }

    const reader = response.body.getReader();
    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // { stream: true } keeps multi-byte characters that are split
      // across chunks from being decoded incorrectly.
      setContent(prev => prev + decoder.decode(value, { stream: true }));
    }
  };

  return { content, stream };
}
```
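To show the hook in context, here is a minimal consuming component; `ChatBox`, the button label, and the `/api/completion` path are illustrative, not from the article.

```typescript
function ChatBox() {
  const { content, stream } = useStreamingResponse('/api/completion');

  return (
    <div>
      <button onClick={() => stream('Explain streaming in one paragraph')}>
        Ask
      </button>
      {/* Re-renders on every chunk, so tokens appear as they arrive. */}
      <pre>{content}</pre>
    </div>
  );
}
```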
Streaming can reduce perceived latency by as much as 80% even when total response time is unchanged, because users start reading as soon as the first token arrives.