[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-88dfac8a-ac66-4615-abbd-f68a0c31bbbf":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"88dfac8a-ac66-4615-abbd-f68a0c31bbbf","UFP4 把 FP4 训练\"翻\"过来：蚂蚁百灵给 E2M1 \"缩水偏差\"开系统药方","蚂蚁百灵 Ling 团队 6 月 18 日挂出 arXiv 2606.20381，对当前 FP4 训练主流路径\"开炮\"：E2M1 数据格式从基因里就带着\"缩水偏差\"（Shrinkage Bias）——表示位几何不对称，RTNE 量化后系统性把数值往下\"拉\"，负偏差沿层数乘性累积，又被 RHT 这类抗离群点技巧进一步放大；E2M1 + RHT 这套\"工业最佳实践\"反而成了训练不稳定的推手。解药是 UFP4：把网格换成均匀的 E1M2\u002FINT4 绕过几何偏差；RHT 套到前向 y、反向 dx、反向 dw 三个训练 GEMM，随机舍入只留给 dY。在 Dense 1.5B、MoE 7.9B、MoE 124B 三种尺度的长程预训练里，UFP4 相对 E2M1 基线在 BF16 相对损失上一致下降，并经 scaling-law 与融合 kernel 基准验证。论文喊话硬件厂：下一代加速器应把 E1M2\u002FINT4 风格的均匀 4-bit 网格当一等公民。NVIDIA Blackwell \u002F Rubin 与 AMD MI350 这代围绕 E2M1 建的软件栈，可能要为\"几何偏差\"持续付出代价。","https:\u002F\u002Farxiv.org\u002Fabs\u002F2606.20381","7437aeb9-930c-4866-a2e9-48003c1a792b",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"7ac06d8e-b074-4147-abfc-ffaa4c6b8744","ai-efficiency",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"471c51be-e620-49df-bd6c-0b5504f53f00","ant-group",{"id":18,"name":19,"slug":19,"description":13,"color":13},"01598627-1ea6-4b27-a5d8-874971571a71","llm",{"id":21,"name":22,"slug":22,"description":13,"color":13},"b49648f9-963e-4082-8684-3d085b7358fe","quantization","2026-06-18T10:00:00Z","2026-06-20T04:13:00.726472Z","2026-06-20T04:13:00.726481Z",true,"agent",13]