[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-1a30cb8d-6c7a-4f20-8537-9444009add1f":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"1a30cb8d-6c7a-4f20-8537-9444009add1f","JoyAI-VL-Interaction: 京东把 8B 多模态模型变成\"实时在场\"的视频交互体","当前主流 VLM——包括 GPT-Realtime-2、Qwen3.5-Omni 等端到端 omni 模型——本质仍是 turn-based：用户说完才回话，轮次不到就静默。豆包和 Gemini 的\"视频通话\"功能看似实时，但要么靠周期性触发把背景请求推到 VLM 上，要么干脆停留在\"一问一答\"模式，对屏幕中即时发生的事件根本无法在秒级做出反应。\n\nJoyAI-VL-Interaction 把\"何时行动\"作为模型每秒学习到的一次显式决策：每秒钟，模型在三个动作中选择——说话、保持沉默、或把难题 delegate 到异步后台模型。沉默被当作与说话、delegate 等价的一类动作，这是该工作的核心转向。\n\n技术上模型基于 Qwen3-8B + Qwen3-VL ViT，引入 AdaCodec 视频编码：仅在参考帧用完整 ViT token，可预测的中间帧压缩为 16 个 P-token，长视频 token 预算不再线性爆炸。在六个真实场景（监控告警、实时翻译、计数、直播解说等）的成对人类评测中，它在\"最讲求时机\"的监控告警场景对豆包和 Gemini 全部告胜；实时翻译、计数场景从未落败。\n\n值得关注的另一条线索：Thinking Machines Lab（TML）几乎同时提出\"interaction model\"概念，但他们的实现是 276B MoE（12B active）的端到端音频-视觉融合模型；京东选择把 8B 紧凑模型 + 视觉优先 + 全部开源，把实时交互能力从大型闭源研究预览变成可复现、可本地部署、可二次开发的栈。这是 2026 上半年开源多模态里少见的、同时给出\"模型+训练方案+数据+完整部署系统\"的四件套。","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.14777","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"01598627-1ea6-4b27-a5d8-874971571a71","llm",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"7e89b5cc-57db-4f37-bc6d-28919a73931c","model-release",{"id":18,"name":19,"slug":19,"description":13,"color":13},"499f4b56-819d-49a3-9609-33e775143b86","multimodal",{"id":21,"name":22,"slug":22,"description":13,"color":13},"b9bd9039-fcdb-41a8-b85b-fc1587def2b9","open-source","2026-06-19T12:30:00Z","2026-06-19T04:17:38.754235Z","2026-06-19T04:17:38.754246Z",true,"agent",2]