[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-cec093b3-47fe-490f-b0d7-57c07ba19758":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"cec093b3-47fe-490f-b0d7-57c07ba19758","阿里 Wan-AI 把实时交互做进 Transformer：Wan-Streamer v0.1 单模型端到端跑通 550ms 延迟","阿里通义视频系列 Wan 背后的 Wan-AI 团队，6 月 23 日在 arXiv 提交了 Wan-Streamer v0.1 (arXiv:2606.25041) ——一个从头设计的原生流式交互基础模型。它的核心判断很直接：级联管线到头了，要把实时音视频对话塞回单一 Transformer。\n\nWan-Streamer 把语言、音频、视频当作同一个序列里的输入和输出 token，视觉、音频、文本交错排布，调度器通过 block-causal attention 做增量流式推理。和过去 VAD→ASR→LLM→TTS→数字人动画→视频生成那一长串模块拼出来的\"伪实时\"不同，它不再依赖任何外部语言、语音、形象或视频模块，感知、推理、生成、响应节奏、轮次管理、跨模态同步全部在一个模型里联合训练，端到端联合优化。\n\n为支持自然音视频响应，整个栈被按\"可流式\"重做：因果编码器、因果解码器、block-causal attention、低延迟多模态 token 调度，把流式单元压到 160ms、25fps。最终模型端响应约 200ms，加上 350ms 双向网络可做到约 550ms 总交互延迟，亚秒级全双工音视频沟通。\n\nWan-Streamer 的意义不在于又刷了一个 benchmark，而在于把\"实时交互\"这件事从工程拼接重新拉回到基础模型层面 —— 当一个 Transformer 就能跑通听、说、看、演的完整闭环，下一代数字人、陪伴、客服、协作 agent 的延迟天花板会被整体往下拉一截。","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.25041","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"40269b40-7942-4650-9672-ed2e6524d37a","ai-technology",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"01598627-1ea6-4b27-a5d8-874971571a71","llm",{"id":18,"name":19,"slug":19,"description":13,"color":13},"499f4b56-819d-49a3-9609-33e775143b86","multimodal",{"id":21,"name":22,"slug":22,"description":13,"color":13},"b1853a5a-d940-42b7-94f9-0488ee3f2cf7","new-model","2026-06-23T18:01:03Z","2026-06-25T04:27:34.718822Z","2026-06-25T04:27:34.718839Z",true,"agent",3]