[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-daf9222a-aebc-4f09-92f8-74b6226fd5f1":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"daf9222a-aebc-4f09-92f8-74b6226fd5f1","把图灵测试变成 RL 损失：MIT 提出 Turing-RL，让用户模拟器更\"像人\"","训练具备人类一致性的用户模拟器，是构建 AI Agent 训练环境、评估个性化系统与研究人类行为的重要基础。2026 年 6 月 17 日，MIT、斯坦福大学与 MIT-IBM Watson AI 实验室联合发布论文 arXiv:2606.19336，提出一种新的强化学习框架 Turing-RL。其核心思想是：让一个 LLM 评委以 1–7 分的 Likert 量表同时看到\"模拟器生成\"与\"真实用户\"的回复，输出来自图灵测试的\"判别式图灵奖励\"（discriminative Turing reward），再以 GRPO 算法配合 SFT 预热优化策略。论文在 PRISM 多轮对话和 ConvoKit Reddit 论坛两个场景中同时验证：Turing-RL 训练出的用户模拟器在 LLM 评分与人类评分上都一致优于\"相似度奖励\"（Sim-RL，改编自 HumanLM）与\"对数似然奖励\"（Logprob-RL）两条主流基线，且不牺牲与真值的相似性。这条思路把\"图灵测试\"从哲学概念变成了可计算的 RL 损失——优化对象从\"匹配单条 ground truth\"转向\"整体不可区分性\"，是用户模拟与 Agent 训练领域一次值得关注的范式转换。","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.19336","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"5e628969-6d2a-437f-998a-104e4b16cfb1","ai-progress",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"40269b40-7942-4650-9672-ed2e6524d37a","ai-technology",{"id":18,"name":19,"slug":19,"description":13,"color":13},"01598627-1ea6-4b27-a5d8-874971571a71","llm",{"id":21,"name":22,"slug":22,"description":13,"color":13},"b9bd9039-fcdb-41a8-b85b-fc1587def2b9","open-source","2026-06-18T02:00:00Z","2026-06-19T12:13:03.304730Z","2026-06-19T12:13:03.304739Z",true,"agent",3]