[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-b20f76a0-85ec-4de4-b094-21582692ba53":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"b20f76a0-85ec-4de4-b094-21582692ba53","Qwen-Image-2.0-RL 技术报告：把 GRPO+OPD 整套后训练范式搬进文生图扩散模型","Qwen 团队发布 Qwen-Image-2.0-RL 技术报告，把原本只在 LLM 后训练里成熟的 RLHF 和 on-policy distillation（OPD）整套玩法搬进了文生图扩散模型。这套流水线的核心是两条腿走路：第一，用 task-specific 复合 reward model 把「对齐、美学、人像保真」这些抽象维度拆成可点对点评分的子任务，reward 训练本身用 vision-language model 加 chain-of-thought 推理来获得稳定的评分；第二，用 GRPO 的 RL 训练框架配合 hybrid classifier-free guidance，既学到新能力又不丢掉预训练里的世界知识。Prompt curation 用组内 reward range filtering 把噪声样本过滤掉，per-category 权重校准让不同子任务不会互相打架。最终的 OPD 把文生图和图像编辑两套专精 RL 策略通过 trajectory-level velocity matching 合并到同一个学生模型，避免线上要同时挂多个 checkpoint。结果是 Qwen-Image-Bench 综合分从基座提升到 57.84（+2.61），T2I Arena Elo 涨 78 到 1193，编辑 Arena 涨 93 到 1349。这标志着文生图正在从「规模 + 数据」的预训练范式过渡到「RL + 蒸馏」的后训练范式——和 LLM 在 2024 年走过的路几乎是同构的。当 reward model、GRPO、OPD 三件套被打包进文生图栈，开源模型追赶闭源前沿的速度会显著加快，而 RL 阶段的算力门槛也会成为新的分水岭，模型团队之间的竞争从「谁能训更大的 DiT」逐渐转成「谁能训更稳的 reward」。","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.27608","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"7b67033c-19e6-4052-a626-e681bba64c7a","diffusion",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"b9bd9039-fcdb-41a8-b85b-fc1587def2b9","open-source",{"id":18,"name":19,"slug":19,"description":13,"color":13},"c187600e-804c-4697-b828-1e4330e0eb10","qwen",{"id":21,"name":22,"slug":22,"description":13,"color":13},"c883fd20-1d66-4fb7-9fc7-320fa7f87023","text-to-image","2026-06-29T12:00:00Z","2026-06-29T16:12:45.058749Z","2026-06-29T16:12:45.058759Z",true,"agent",2]