[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"news-bac63469-b19c-4619-8709-73656ad0cd9f":3},{"id":4,"title":5,"summary":6,"original_url":7,"source_id":8,"tags":9,"published_at":23,"created_at":24,"modified_at":25,"is_published":26,"publish_type":27,"image_url":13,"view_count":28},"bac63469-b19c-4619-8709-73656ad0cd9f","Intel × HF 上线 xpu-kernels Skill：LLM Agent 把 vLLM 调过的 Triton 内核再提 2.8×","Intel 与 Hugging Face 联合发布 xpu-kernels Agent Skill，把 Intel Labs 的 Xe-Forge 框架封装成 Coding Agent 可调用的「技能」，专攻 Intel Arc Pro B70 等 Xe2 GPU。核心是 CoVeR 循环：LLM 当规划器，最多跑九轮候选，每轮在真硬件上做正确性校验与基准测试，错了回退到最优分支；并配一份 XPU 专属知识库（tensor descriptor、GRF mode 256、tile swizzling 等）补上 LLM 训练语料里欠采样的细节。结果：在 Arc Pro B70 上相对 PyTorch eager 在 100 个 KernelBench Level-2 拿到 1.26× geomean 加速（胜率 69%）；更硬核的是，在 vLLM 已被工程师手工调过的 24 组生产配置（BatchedMoE \u002F FusedMoE \u002F UnifiedAttention，覆盖 Gemma2\u002F3-27B、gpt-oss 20B、Llama3.3-70B、Qwen3）上又榨出 2.8× geomean，Qwen3-30B-A3B-Instruct decode 提升高达 35×，Flash Attention 长序列下 13.3×。代码侧由 kernel-builder CLI 编译后上传 HF Kernel Hub，下游 get_kernel() 一行加载。这条路径首次在非 NVIDIA 加速器上击败资深工程师，对国产 GPU\u002FTPU 生态是值得复制的样板。","https:\u002F\u002Fhuggingface.co\u002Fblog\u002Fdanf\u002Fintel-xpu-kernels-skill","24d5c6c5-6573-4180-a1fd-f1459842d1af",[10,14,17,20],{"id":11,"name":12,"slug":12,"description":13,"color":13},"7ac06d8e-b074-4147-abfc-ffaa4c6b8744","ai-efficiency",null,{"id":15,"name":16,"slug":16,"description":13,"color":13},"40269b40-7942-4650-9672-ed2e6524d37a","ai-technology",{"id":18,"name":19,"slug":19,"description":13,"color":13},"e0d31e94-ce47-4c8f-831c-d3d2926d42f3","hardware",{"id":21,"name":22,"slug":22,"description":13,"color":13},"0a93ec8e-ea39-4693-81de-563ca8c173f7","inference","2026-06-20T00:16:00Z","2026-06-20T00:17:45.981058Z","2026-06-20T00:17:45.981068Z",true,"agent",3]