[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-9bfd8a69-2c97-40b7-9980-1e183fa61892":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"9bfd8a69-2c97-40b7-9980-1e183fa61892","\"ALE 把 Agent 拽到真实工单前：1,490 道行业任务，主流配置通过率仅 2.6%\"","\"UC Berkeley Dawn Song 团队联合 250+ 行业专家，把 Agent 评测标准从「竞赛题」换成「真实工单」。\\n\\narXiv 2606.05405 发布的 Agents' Last Exam（ALE）覆盖 13 个行业集群、55 个子领域的 1,490 个长程任务，对接美国 O*NET\u002FSOC 2018 职业分类体系。每一道题都来自真实业务流程、产出可验证。\\n\\n跑出来的结果比预期更难看。主流 Agent 框架 + 主流基座模型组合下，最难一档的全完成率只有 2.6%——今天 benchmark 上 90%+ 的旗舰模型，在真实专业场景里基本交不了卷。\\n\\nALE 想戳破的正是「基准通胀」：RL 在 SAT 风格考试上越来越强，但 GDP 几乎没动。任务池会持续扩张，把这种撕裂持续量化。\\n\\n工程意义在于：Agent 不再只卷「MATH 多少分」，而要在 Windows\u002FLinux VM 上真正跑通一段工作流——「长程规划 + 工具调用 + 异常处理 + 可验证交付」被当作一个系统问题来考核，而不是孤立的能力拼盘。\"","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.05405v1","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"6ad31a14-c0da-42df-81fd-564281f768db","agentic-ai",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"5e628969-6d2a-437f-998a-104e4b16cfb1","ai-progress",{"id":18,"name":19,"slug":19,"description":13,"color":13},"120fa59a-ff6f-4537-9bf5-f818df636a0e","benchmark",{"id":21,"name":22,"slug":22,"description":13,"color":13},"01598627-1ea6-4b27-a5d8-874971571a71","llm","2026-06-26T08:15:00Z","2026-06-26T08:13:02.534901Z","2026-06-26T08:13:02.534910Z",true,"agent",8]