<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" style="font-size:16px;"><head><meta charset="utf-8"/><!--[if !mso]><!--><meta http-equiv="X-UA-Compatible" content="IE=edge"/><!--<![endif]--><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="x-apple-disable-message-reformatting"/><meta name="format-detection" content="telephone=no,address=no,email=no,date=no,url=no"/><meta name="color-scheme" content="light"/><meta name="supported-color-schemes" content="light"/><title>Multi-Token Attention </title><!--[if mso]><xml><o:OfficeDocumentSettings><o:AllowPNG/><o:PixelsPerInch>96</o:PixelsPerInch></o:OfficeDocumentSettings></xml><![endif]--><style> :root { color-scheme: light; supported-color-schemes: light; } body { margin: 0; padding: 0; min-width: 100%!important; -ms-text-size-adjust: 100% !important; -webkit-transform: scale(1) !important; -webkit-text-size-adjust: 100% !important; -webkit-font-smoothing: antialiased !important; } .body { word-wrap: normal; word-spacing:normal; } table.mso { width: 100%; border-collapse: collapse; padding: 0; table-layout: fixed; } img { border: 0; outline: none; } table { mso-table-lspace: 0px; mso-table-rspace: 0px; } td, a, span { mso-line-height-rule: exactly; } #root [x-apple-data-detectors=true], a[x-apple-data-detectors=true], #MessageViewBody a { color: inherit !important; text-decoration: inherit !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important; } span.MsoHyperlink { color: inherit !important; mso-style-priority: 99 !important; } span.MsoHyperlinkFollowed { color: inherit !important; mso-style-priority: 99 !important; } .a { background-color:#dedede; } .b { background-color:#2a2a2a; } .c { background-color:#ffffff; } .d { background-color:#fff0c8; } .d2 { background-color:#FFFFFF; } .d3 
{ background-color:#FFFFFF; } h1 a { text-decoration:none;color:#2A2A2A !important; } h2 a { text-decoration:none;color:#2A2A2A !important; } h3 a { text-decoration:none;color:#2A2A2A !important; } h4 a { text-decoration:none;color:#2A2A2A !important; } h5 a { text-decoration:none;color:#2A2A2A !important; } h6 a { text-decoration:none;color:#2A2A2A !important; } h1, h1 a, h2, h2 a, h3, h3 a, h4, h4 a, h5, h5 a, h6, h6 a, ul, li, ol, p, p a { margin: 0;padding: 0; } h1 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:28px;color:#2A2A2A;line-height:42px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h2 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:24px;color:#2A2A2A;line-height:36px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h3 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:20px;color:#2A2A2A;line-height:30px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h4 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:18px;color:#2A2A2A;line-height:27px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h5 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:16px;color:#2A2A2A;line-height:24px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h6 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:14px;color:#2A2A2A;line-height:21px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } p { font-family:'Georgia','Times New Roman',serif;font-weight:400;color:#2D2D2D;font-size:16px;line-height:24px;padding-bottom:8px;padding-top:8px;mso-margin-top-alt:8px;mso-margin-bottom-alt:8px; } p a, .e a, ul a, li a, .h a, .h2 
a, .h3 a { word-break:break-word;color:#2C81E5 !important;text-decoration:none;font-style:italic; } p a span, .e a span, ul a span, li a span { color: inherit } p .bold { font-weight:bold;color:#2D2D2D; } p span[style*="font-size"] { line-height: 1.6; } .f p { font-size:12px;line-height:15px;color:#2D2D2D;padding:0; } .f p a { color:#2D2D2D !important; } .g p { font-family:'Helvetica',Arial,sans-serif;font-size:14px;line-height:20px;font-weight:normal;margin:0; } .g p a { text-decoration: underline; } .i p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i p a { color:#2D2D2D !important; } .i2 p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i2 p a { color:#2D2D2D !important; } .i3 p { font-family:'Helvetica',Arial,sans-serif;line-height:43px;font-size:24px;color:#2D2D2D; } .i3 p a { color:#2D2D2D !important; } .h p a { color:#595959 !important; } .h2 p a { color:#595959 !important; } .h3 p a { color:#595959 !important; } .f p a, .i p a, .i2 p a, .i3 p a, .h p a, .h2 p a, .h3 p a { text-decoration:underline; } .j { border-top:3px solid #ffeb2d; } .k p { padding-left:15px;padding-bottom:0px;padding-top:6px;mso-margin-top-alt:6px;mso-margin-bottom-alt:0px;mso-margin-left-alt:15px; } .o { background-color:#FFFFFF;border:1px solid #F1F1F1;border-radius:5px; } .o p { font-family:'Helvetica',Arial,sans-serif;padding:0px;margin:0px; } .l p, .l p a { font-size:14px;line-height:20px;font-weight: bold;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .m p, .m p a { font-size:13px;line-height:18px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .n p, .n p a { font-size:12px;line-height:17px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .p { background-color:#FFFFFF;max-width:520px;border:1px solid #E1E8ED;border:1px solid rgba(80, 80, 80, 
0.3);border-radius:5px; } .q { font-size:16px;font-family:Helvetica,Roboto,Calibri,sans-serif !important;border:1px solid #e1e8ed;border:1px solid rgba(80, 80, 80, 0.3);border-radius:10px;background-color:#FFFFFF; } .q p { font-size:16px;font-family:system-ui,Helvetica,Roboto,Calibri,sans-serif !important;color:#222222;padding:4px 0; } .r { border:1px solid #E1E8ED !important;border-radius:5px; } .s p { font-size: 14px; line-height: 17px; font-weight: 400; color: #697882; text-decoration: none; } .t p { font-family:'Helvetica',Arial,sans-serif;font-size:12px;line-height:18px;font-weight:400;color:#000000;font-style:italic;padding:4px 0px 0px;} .v { border-radius:10px;border:solid 0px #DFD150;background-color:#2C81E5;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;color:#FFFFFF; } .v a { text-decoration:none;display:block;color:#FFFFFF; } .w p { font-size:12px;line-height:15px;font-weight:400;color:#FFFFFF; } .w p a { text-decoration: underline !important;color:#FFFFFF !important; } ul { font-family:'Helvetica',Arial,sans-serif;margin:0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:disc;font-size:16px; } ul > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:disc; } ol { font-family:'Helvetica',Arial,sans-serif;margin: 0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:decimal;font-size:16px; } ol > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:decimal; } .e h3, .e p, .e span { padding-bottom:0px;padding-top:0px;mso-margin-top-alt:0px;mso-margin-bottom-alt:0px; } .e span, .e li { font-family:'Helvetica',Arial,sans-serif;font-size:16px;color:#2D2D2D;line-height:24px; } .rec { font-family: ui-sans-serif, system-ui, -apple-system, 
BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji" !important; } .rec__button:hover { background-color: #f9fafb !important; } .copyright a {color: inherit !important; text-decoration: none !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important;} .txt_social p { padding: 0; word-break: break-all; } .table, .table-c, .table-h { border: 1px solid #C0C0C0; } .table-c { padding:5px; background-color:#FFFFFF; } .table-c p { color: #2D2D2D; font-family:'Helvetica',Arial,sans-serif !important;overflow-wrap: break-word; } .table-h { padding:5px; background-color:#F1F1F1; } .table-h p { color: #2A2A2A; font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif !important;overflow-wrap: break-word; } @media only screen and (max-width:667px) { .aa { width: 100% !important; } .bb img { width: 100% !important; height: auto !important; max-width: none !important; } .cc { padding: 0px 8px !important; } .ee { padding-top:10px !important;padding-bottom:10px !important; } .ff ul, .ff ol { margin: 0px 0px 0px 10px !important;padding: 0px !important; } .ff li { margin:10px 0px 0px 10px !important; } .r {height:140px !important;} .s p { font-size:13px !important;line-height:15px !important; } .mob-hide {display:none !important;} .mob-stack {display:block !important;width:100% !important;} .mob-w-full {width:100% !important;} .mob-block {display:block !important;} .embed-img {padding:0px 0px 12px 0px !important;} .socialShare {padding-top:15px !important;} .rec { padding-left:15px!important;padding-right:15px!important; } .bodyWrapper { padding:7px 4px 7px 4px !important; } .social-mobile {float:left !important;margin-top:10px !important;} } @media screen and (max-width: 480px) { u + .a .gg { width: 100% !important; width: 100vw !important; } .tok-heart { padding-top:75% !important; } 
.tok-play { padding-top: 250px !important; } } @media screen and (max-width: 320px) { .tok-heart { padding-top:65% !important; } } .u { border: 1px solid #CACACA !important; border-radius: 2px !important; background-color: #ffffff !important; padding: 0px 13px 0px 13px !important; font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif !important;font-size: 12px !important; color: #767676 !important; } .u a { text-decoration: none; display: block !important; color: #767676 !important; margin: 0px !important; } .u span, .u img { color: #767676 !important;margin:0px !important; max-height:32px !important;background-color:#ffffff !important; } </style><!--[if mso]><style type="text/css"> sup { font-size: 100% !important;vertical-align: .5em !important;mso-text-raise: -1.5% !important;line-height: 0 !important; } ul { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ul li { margin-left: 0px !important; mso-special-format: decimal; } ol { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ol li { margin-left: 0px !important; mso-special-format: decimal; } li.listItem { margin-left:15px !important; margin-top:0px !important; } .paddingDesktop { padding: 10px 0 !important; } .edm_outlooklist { margin-left: -20px !important; } .embedImage { display:none !important; } </style><![endif]--><style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-display: swap; src: url('https://fonts.gstatic.com/s/opensans/v40/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4gaVIUwaEQbjA.woff2') format('woff2'); } @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 700; font-display: swap; src: url('https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@1,700&display=swap') format('woff2'); } </style></head><body 
class="a" style="margin:0px auto;padding:0px;word-wrap:normal;word-spacing:normal;background-color:#dedede;"><div role="article" aria-roledescription="email" aria-label="email_name" lang="en" style="font-size:1rem"><div style="display:none;max-height:0px;overflow:hidden;"> Plus more about Inference-Time Scaling for Generalist Reward Modeling and Why do LLMs attend to the first token?  ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ </div><table role="none" width="100%" border="0" cellspacing="0" align="center" cellpadding="0" class="gg"><tr><td align="center" valign="top"><table role="none" width="670" border="0" cellspacing="0" cellpadding="0" class="aa" style="width:670px;table-layout:fixed;"><tr><td class="bodyWrapper" align="center" valign="top" style="padding:7px 7px 7px 7px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="border-width:0px 0px 0px 0px;border-style: solid; border-color: #2a2a2a;border-radius:10px 10px 0px 0px;background-color:#ffffff;" class="c"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr id="header"><td style="padding:28px 28px 0 28px;"><div style="padding-top:0px;padding-right:0px;padding-bottom:20px;padding-left:0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td class="f" align="right" valign="top"><p> April 08, 2025 | <a 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxSdB5RCIH6yy1Fm1CYma3EzXJhxjr0P4KnjB1cwYihusQCetHV2npB7BJ_bi4CqDeCIs5zTQFg9mwUfzVwbj0STx1b_kZGHHjll3dauVoNN0Yp-xzy41pse2F_nqCz7CAPrBrq_5eKrq-dzHWikmSNgzcU5joWCi5OMd137zTMP98scw0QDXHyqRyDnkQVSzuFED6wmLG2azns6w8frwOp2bLcP_0B97t0SlxCNXaK7YCPCOIYPXhMfz6QamNSYaeGaEAKGiDo9esBiUPVhKsB-JKJe7M5yh8DvawGZ1Jj7ND3yQRF-Wuafdw_naCFsCRphFCOEM_yYOGggwwyrGzoj-v34LMwwycq4jS8I0FoMvjHf2Z1nTZZiaVIdXUYuw8N3Xm0lLU38WDODyfWXM_KilFCBipZvZGn_W2sEqKOX5DogErZawy8sctLCiKXI92U4jjqAAA3R_B3waa7C5mL_AWqs3rYGIHMt6149cyZNMkog7dvbrBoSbVXx7OdpE4NE2wErAr29JEHQY8ROKHcvtFL6VXfsAwXVL8f_HAH0yg2uRJy69VVg7Kf5fu99UeBZ0dujfcsH09Bw6oZ58pnSHSBASTx6quuPWyAmz6dXZbHF44NJyVSC3wote_PnxFA/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h0/h001.3hRFOsyYcV2h1j0Q717bD8m_nssXfb5yFW-Cfotf66A"><span>Read Online</span></a></p></td></tr><tr><td class="dd" align="center" valign="top" style="padding:15px 0;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><h1 style="text-align:left;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;font-weight:Bold;font-size:32px;color:#2A2A2A;padding:2px 0;line-height:38px;"> Multi-Token Attention </h1><p style="text-align:left;font-family:'Helvetica',Arial,sans-serif;font-weight:normal;font-size:20px;color:#3E3E3E;padding:5px 0;line-height:24px;"> Plus more about Inference-Time Scaling for Generalist Reward Modeling and Why do LLMs attend to the first token? 
</p></td></tr></table></td></tr><tr><td style="height:0px;width:0px;"><div style="height:1px;" data-open-tracking="true"> <img src="https://elink4f7.mail.bycloud.ai/ss/o/u001.3wmUuY8gEWd4_869a_eXcg/4fh/-N3GhbeyQjqJYdVRbqj4NQ/ho.gif" alt="" width="1" height="1" border="0" style="height:1px !important;width:1px !important;border-width:0 !important;margin-top:0 !important;margin-bottom:0 !important;margin-right:0 !important;margin-left:0 !important;padding-top:0 !important;padding-bottom:0 !important;padding-right:0 !important;padding-left:0 !important;"/> </div></td></tr></table></div></td></tr><tr id="content-blocks"><td class="email-card-body" align="center" valign="top" style="padding-bottom:28px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td id="nov-18-th-nov-24-th-33-latest-ai-re" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h6 style="color:#2A2A2A;font-weight:normal;"><i>Mar 31st ~ Apr 6th</i><br><i>#50 Latest AI Research Explained Simply</i></h6></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="industry-news-in-1-line" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">🗞️ Industry News in 1 Line</h2></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 
0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 12k </span></span> <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoUI6Y4Gv7dIw3HkBvIz00e8th4mEuBIY9yk0e8Zy4RKUZw3ryxiIfWTIaawhgGiA10RND5M4jdL9MaZo4SJr2Zchbou0EPlS0YMSdXWbmoaHjGctRLuq-pMPzt4P77j60L9aTP8lqIT9ZC2aHZ-NwFY/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h1/h001.ZP9faIch-bwBubnuvXSbswe5wixfcjjO5KecBOdbyuI" target="_blank" rel="noopener noreferrer nofollow"><span>Meta has released the Llama-4 series</span></a> <i>on a SATURDAY</i>. It is a multi-modal MoE model family capable of language and visual understanding. Scout and Maverick are open-weight and <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j21RPoKpth6LqlJ1b93IGxleFYBQLMaKDlMTpDKq-HFi0Yg8FGuDirFVeKeus1sMDDwb_xN1zxIDiAkOKk9GaON5YQOrRtXEmPEv8VCFQJw9Q/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h2/h001.o8Vk4hj4R6LD9JYyGUmBuDtZ_pLJCtakNzkfM_yohQ4" target="_blank" rel="noopener noreferrer nofollow"><span>now available for download</span></a>. The highlight of this release is that Scout can process a context window of up to 10M tokens, while Maverick handles up to 1M. However, the community has been disappointed with its performance. A bycloud video is coming. 
</p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:510px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/e6ab8e47-3a4d-4afb-b982-34f7eaeb6cc9/Gnyz0XFbYAEznnT.jpg?t=1744144546" alt="" height="auto" width="510" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:510px;"><p>Llama-4 model specs</p></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 3k </span></span> <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoc_DCKgi0ZToYGgu3OWnN3c9RfSHv1AcBtQFjN0A5Dg5J67kchSCwPecQt9XedjL-twscVo4E-wgPxGnuYpu8zLJnR-kUwJEsSceC78d5Knu7lh5U5t2ZVFwPG_SJ-MO_w/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h3/h001.-FXYe7ZSPSJs1kal942NEPKUWV-W9wJH3kmHTjuyYSU" target="_blank" rel="noopener noreferrer nofollow"><span>Gemini 2.5 Pro has moved to a preview version</span></a>, called Gemini 2.5 Pro Preview, and is now available for scaled usage. You can still use it for free on <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoXy3TFQVRsN_o5nO7Vn3nW8lVxPNQUYuQwH5y7pqf7Y532fX_gNIL1ZqkGRRM64mMCOTzgN6SGDDbkH_yBVorHcV3OaX1EGXLSkiVSy0Is82/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h4/h001.nlzU01dlKgEGanOjM0-FMoCbr7G9B0D2YXrayXuhi5k" target="_blank" rel="noopener noreferrer nofollow"><span>Aistudio</span></a> with a rate limit. 
</p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:510px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/1d9b07a2-dd63-43f4-9f1c-d6baaf202a20/GnsxzZlXUAA-tZ4.jpg?t=1744144864" alt="" height="auto" width="510" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:510px;"><p>Gemini 2.5 Pro Preview API pricing</p></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 663 </span></span> OpenAI raises $40B at $300B post-money valuation, becoming one of the largest private funding rounds in history. <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoKkcSjnVSYDZmfB2wb3TkkRP0QIqMpmF1KtGaIMW4byay_7sNOGhfOeJoQZ9WgBfBLpY54T-SH_zABi0lmXLmDYQcUbhIHN_INFcm7c6__dVjdcS-GIv2WI8LkUW3nJ_GTE6d-L18DsmRc1wRMbtWI14X7sDa9PfYe5-k9OYKcwT/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h5/h001.qWt_TEAxZnENY0_oIpzgz4rEP1S0Q9K3hTJJzynEiJQ" target="_blank" rel="noopener noreferrer nofollow"><span>Via TechCrunch</span></a></p></li></ol></div></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="transparent" 
style="background-color:transparent;border-color:#2C81E5;border-style:solid;border-width:5px;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;"><span style="">Support My Newsletter</span></h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="color:rgb(34, 34, 34);font-family:Georgia, 'Times New Roman', serif;font-size:16px;">As I aim to keep this newsletter free forever, your support means a lot. If you like reading The AI Timeline, consider forwarding it to another research enthusiast. It helps us keep this up for free!</span></p></td></tr><tr><td align="center" valign="top"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="font-size:0px;line-height:0px;padding:30px 0px 30px;" class="dd"><table class="j" role="none" width="50%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td> </td></tr></table></td></tr><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Share The AI Timeline</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> You currently have <strong>0</strong> referrals. 
</p></td></tr><tr><td align="left" valign="top" style="padding: 20px 0px 20px 0px; display:none;width:0px;max-height:0px;overflow:hidden;mso-hide:all;height:0;font-size:0;max-height:0;line-height:0;margin:0 auto;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 0;"><tr><td align="center" valign="top" style="width:300px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsGNUqyW5TiZkyMsF1yreu0byy2KW36J1wDdpoLuXg2TU1F1OW8OHoHaU4-ZmrZpPU4RN-crQCEimD190CSn9fPuxpIRojBJyu1VfV5KtQD3QMVdSg2JrjEj5-xm4r4E12Whf08itqPCb9Q5W0X4rt3ubYkqCmWnLeZpmb3_RZcbIk0UE5wZnFLCQJHLFs0qZ0OGpXp89o1HU4mWIBur5Or4tQGm5M_Y8m5PvTEfYfxLRyrcRv7GyVs5oLtFfiySZ2SqtZypLA-h50h61p0uPiA7iA_PiMqlVLtM-87XL33VZi05_O3UTpWE_0nAzFRJ4TW1ayz3_vn4Zlp9IERdbnnAd_1kPLD4lAQcR5PRXgtpC93mG85TXt5CdbsxNjziBUVJscfwOVMMhahhy8eTObn2t9B737YK4waqesjGhT2wWjYmvV4vSfsnurknATq5OxjNCRWAuzb4WO0gW29mc21OXfOnRYKMRy60van_3B3jP4iUQFceGPUP1SWLUF_tpCOm0-9IZHi4jJ5ClSSfmNFiZfUm5HGoPijfdFiHmki9p/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h6/h001.aW2l_YxutqxcqM4Meb55T8nqI_B10IKrhHhpKZqkmto" rel="noopener noreferrer nofollow" style="text-decoration:none;" target="_blank"><img src="" alt="" height="auto" width="300" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></a></td></tr></table></td></tr><tr><td align="left" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:left;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="left" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsGNUqyW5TiZkyMsF1yreu0byy2KW36J1wDdpoLuXg2TU1F1OW8OHoHaU4-ZmrZpPU4RN-crQCEimD190CSn9fPuxpIRojBJyu1VfV5KtQD3QMVdSg2JrjEj5-xm4r4E12Whf08itqPCb9Q5W0X4rt3ubYkqCmWnLeZpmb3_RZcbIk0UE5wZnFLCQJHLFs0qZ0OGpXp89o1HU4mWIBur5Or4tQGm5M_Y8m5PvTEfYfxLRyrcRv7GyVs5oLtFfiySZ2SqtZypLA-h50h61p0uPiA7iA_PiMqlVLtM-87XL33VZi05_O3UTpWE_0nAzFRJ4TW1ayz3_vn4Zlp9IERdbnnAd_1kPLD4lAQcR5PRXgtpC93mG85TXt5CdbsxNjziBUVJscfwOVMMhahhy8eTObn2t9B737YK4waqesjGhT2wWjYmvV4vSfsnurknATq5OxjNCRWAuzb4WO0gW29mc21OXfOnRYKMRy60van_3B3jP4iUQFceGPUP1SWLUF_tpCOm0-9IZHi4jJ5ClSSfmNFiZfUm5HGoPijfdFiHmki9p/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h7/h001.qmPGe8mBlVRYJSak7rP6k3tqPj-XGvpt_jt8YdD7V0Q" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Click to Share </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Or copy and paste this link to others: <a class="link" href="https://mail.bycloud.ai/subscribe?ref=6SqUHb8KiF&_bhlid=7fecfad9eb7fd8bcdb529e945e11346b5897acdc" target="_blank" rel="noopener noreferrer nofollow" clicktracking="off"><span>https://mail.bycloud.ai/subscribe?ref=6SqUHb8KiF</span></a></p></td></tr><tr><td align="center" valign="top" style="font-size:0px;line-height:0px;padding:30px 0px 30px;" class="dd"><table class="j" role="none" width="50%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td> </td></tr></table></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 
14px auto;"><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.zNfxTwpJFmrsCuJJphGRkKSrCVph9-fOYkcjx4VfJRwtQQsKrZC8pi-PiKai2fq4lAto9WepTJo69aQJ1T73b1BYaJHeCrLz1cWpFYfpKjdJ071BkzwRo9IrCS5YAIxy/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h8/h001.AGStj2Ck-bw-MP2PcLA8QxPfJxgjsuWG74rWWc4F_p4" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Check Out My Patreon </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoGymQ3NNPtd5dE5MV_8UgjIDFPVXngz8pvQBldSW42yhUe_Qiq6DgEPMEBuPL9yfRpXelTiuu2kS8pLFvsoem_XoZoy_n13sTKUhZIbl0VH6/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h9/h001.dLjycdMs0bDoFqSY2IMuMlvbuL2LH40zm0Qne0s-9Pk" target="_blank" rel="noopener noreferrer nofollow"><span>Advertise with The AI Timeline! 
</span></a></span></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="multi-token-attention" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Multi-Token Attention </h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Golovneva et al. [FAIR at Meta]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 793 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Attention </span></span></p></td></tr><tr><td id="introduction-to-multi-token-attenti" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to Multi-Token Attention</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> There is a fundamental limitation in traditional attention mechanisms used in large language models, where attention weights are determined by comparing only single query and key token vectors. 
This "single token attention" restricts the model's ability to identify relevant context that requires multiple token associations. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> To solve this problem, the authors propose <span style="font-weight:700;"><b>Multi-Token Attention</b></span> (MTA), which applies <span style="font-weight:700;"><b>convolution operations across keys</b></span>, queries, and attention heads, allowing neighboring tokens to influence each other's attention weights. This approach enables the model to condition its attention on multiple vector pairs simultaneously, facilitating more precise context location in complex scenarios. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/8f46f93b-494f-4cd5-b265-34837e5ef077/image.png?t=1744135833" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="understanding-multi-token-attention" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Understanding Multi-Token Attention (MTA) Mechanism</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Multi-Token Attention solves a fundamental limitation in traditional attention mechanisms by allowing LLMs to consider multiple tokens simultaneously when deciding where to focus. 
In standard attention, each attention value depends solely on comparing a single query vector with a single key vector. This creates a bottleneck when the model needs to find content that contains multiple elements together (like a sentence mentioning both "Alice" and "rabbit"). </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> MTA introduces three new steps in this process: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Key-Query Convolution</b></span>: Instead of looking at token pairs in isolation, MTA applies a sliding window (convolution) over the attention matrix before or after the softmax operation. This allows nearby queries and keys to influence each other's attention weights. For example, when searching for "Alice" and "rabbit," this convolution helps the model focus on areas where both appear in proximity. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Head Mixing Convolution</b></span>: MTA groups attention heads together and allows them to share information through another convolution operation. If one head finds "Alice" and another finds "rabbit," this mixing helps combine these findings to locate where both terms appear together. 
</p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Group Normalization with Depth Scaling</b></span>: This helps maintain balanced gradients throughout the network, preventing the attention signals from being overwhelmed by the residual stream as they flow through deeper layers. </p></li></ol></div></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/0cf6dd36-ac7c-4b22-9327-37a2e9282b5d/image.png?t=1744135872" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The result is an attention mechanism that can effectively use richer contextual information to locate relevant content. Rather than being limited to what can be encoded in a single vector, MTA enables the model to consider patterns across multiple tokens, making it particularly effective for tasks requiring complex information retrieval from long contexts. 
</p></td></tr><tr><td id="evaluating-the-performance-of-multi" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Evaluating the Performance of Multi-Token Attention (MTA)</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The Multi-Token Attention mechanism shows impressive <span style="font-weight:700;"><b>performance improvements</b></span> across various tasks. On the motivating toy task, MTA achieved <span style="font-weight:700;"><b>nearly perfect</b></span> results (0.1% error rate) while standard Transformers struggled significantly (31-78% error rates). This shows MTA's fundamental advantage in tasks requiring multi-token information processing. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/ac4bf908-d6cb-48f3-8aee-ec894ab16d49/image.png?t=1744135898" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Validation perplexity for 880M Transformer model on SlimPajama dataset</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> In large language modeling, MTA consistently outperformed baseline models: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul 
style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Reduced average validation perplexity to 11.09, compared to 11.25 for standard Transformers </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Improved performance on benchmark tasks like LAMBADA (13.6 perplexity vs. 17.6) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Achieved higher average scores across nine popular benchmarks (44.4% vs. 43.7%) </p></li></ul></div></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/9df2a682-68f3-432a-b2a3-6296e16021b2/image.png?t=1744135963" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Multi-needle retrieval accuracy (%) when varying the number of needles (N).</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> MTA particularly excelled at long-context tasks: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p 
style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> On Needle-In-A-Haystack with 6 needles, MTA achieved 67.0% accuracy after fine-tuning, compared to just 31.9% for standard Transformers </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> On BabiLong question-answering tasks, MTA maintained higher accuracy across various distraction text lengths, especially when the context was filled with 4K tokens of distractions </p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> These improvements came with minimal parameter increase (0.001%) and didn't require applying key-query convolution to all layers - just adding it to 2-6 layers delivered significant gains. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92bknplM2vyqhr9hnM_2A5LJ1YbhI5WwWMN3C8dtGDaN34noHH2KFE8QcHwoYal_Kr9-CaHeL8GYiXqhDnvazZYU/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h10/h001.GQTm5KJTYEiYB9FTNLb8sgvFRKKb-7PZYPUCF7aUcqQ" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper 
</a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="inference-time-scaling-for-generali" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Inference-Time Scaling for Generalist Reward Modeling</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Liu et al. [DeepSeek, Tsinghua University]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 555 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Test Time Compute </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="introduction-to-generalist-reward-m" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to Generalist Reward Modeling</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> There is a big challenge in using reinforcement learning (RL) for large language models: obtaining 
accurate reward signals across diverse domains beyond just verifiable questions. The authors of this paper propose <span style="font-weight:700;"><b>Self-Principled Critique Tuning</b></span> (SPCT), a new approach that allows reward models to generate adaptive principles and accurate critiques when evaluating responses. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This study tests this method by sampling multiple reward signals in parallel during inference time and using a meta reward model to guide the voting process. This research suggests that effective inference-time scaling techniques may be more efficient than traditional training-time scaling for improving reward modeling in general domains. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/73e1a4d4-0ab0-488e-bb30-69738d2a00cf/image.png?t=1744136105" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Different paradigms for reward generation</p></td></tr></table></td></tr><tr><td id="understanding-self-principled-criti" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Understanding Self-Principled Critique Tuning (SPCT)</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> SPCT is an innovative approach to make reward models better at 
evaluating AI responses. Traditional reward models simply score responses. SPCT takes a different approach by teaching the model to: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> First generate "principles" - criteria for what makes a good response to a particular query </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Then apply these principles to critique and score responses </p></li></ol></div></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/35c1d7e6-28cb-4e1b-bbb3-f95b5a1f23d9/image.png?t=1744136161" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> SPCT develops this ability in two phases: </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="font-weight:700;"><b>Phase 1: Rejective Fine-Tuning (Cold Start)</b></span></p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px 
!important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> The model learns to generate principles and critiques in the correct format </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> It's trained on examples where it evaluates different numbers of responses </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Low-quality outputs are rejected during training </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Some training includes "hints" about the correct answer to guide learning </p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="font-weight:700;"><b>Phase 2: Rule-Based Reinforcement Learning</b></span></p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> The model generates principles and critiques for various queries </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> It receives positive rewards when its evaluations correctly identify the best response </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> It receives negative rewards when its evaluations are incorrect </p></li><li class="listItem ultext"><p 
style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> This teaches the model to develop useful principles that lead to accurate judgments </p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="font-weight:700;"><b>How It Scales During Inference</b></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> What makes SPCT powerful is its ability to improve with more computing power: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Parallel Sampling</b></span>: The model generates multiple sets of principles and critiques for the same query and responses </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Expanded Value Space</b></span>: By combining multiple evaluations, the model can provide more nuanced scores (like 17/40 instead of just 4/10) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Meta Reward Modeling</b></span>: An additional model helps filter out low-quality evaluations before voting </p></li></ol></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The beauty of this approach is that it generates different perspectives (principles) for evaluation automatically, adapting to each specific query. 
This leads to more accurate and nuanced judgments than traditional reward models. </p></td></tr><tr><td id="results-and-evaluation-of-generalis" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Results and Evaluation of Generalist Reward Modeling</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The benchmark results show that DeepSeek-GRM-27B performs quite well. When using multiple evaluations (scaling at inference time), it <span style="font-weight:700;"><b>outperforms much larger models</b></span> including some with 340 billion parameters and even matches GPT-4o on benchmarks. Generating multiple sets of principles and combining their evaluations (called "voting") significantly improves performance without needing a larger model. Adding a "meta reward model" to filter out low-quality evaluations boosts performance even further. 
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/0dcd155e-5781-4695-bb94-7f061ad861c3/image.png?t=1744136224" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Overall results of different methods and models on RM benchmarks</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This approach works better than traditional methods that use simple scoring, especially since it avoids biases toward particular types of tasks. Most impressively, their <span style="font-weight:700;"><b>27 billion parameter model</b></span> with 32 evaluation samples <span style="font-weight:700;"><b>performed similarly</b></span> to a massive <span style="font-weight:700;"><b>671 billion parameter model</b></span>, suggesting that smart inference techniques can be more efficient than simply building bigger models. 
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/faabfdee-ec46-436a-aac1-24e8a8afe479/image.png?t=1744136292" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92aumkiMqqeiXB_3t60bWR-3VHM2t81csA9Ufe7B1r6qHrPAvF6oFOSl1Et53xWauSjeTK0qepsgY6zXuzIv7k1d/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h11/h001.Zdkym9wtPXK_2DJCfBrtOqWKOD0eag_Fmc897jq-76k" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 
28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="why-do-ll-ms-attend-to-the-first-to" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Why do LLMs attend to the first token?</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Barbero et al. [University of Oxford, National University of Singapore, Google DeepMind]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 589 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Attention </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> bycloud’s pick </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="introduction-to-attention-sinks" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to "Attention Sinks"</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> When you pass a prompt to an LLM, it pays some amount of attention to each part of the sentence. 
This new paper explores the curious phenomenon of "attention sinks" in LLMs, where attention heads disproportionately attend to seemingly meaningless tokens (typically the beginning-of-sequence token), with as much as 80% of attention focused there in models like Llama 3.1 405B. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Rather than viewing these sinks as a defect, this paper suggests that they serve a crucial functional purpose: preventing "over-mixing" of information. The authors' theoretical and empirical analysis suggests attention sinks act as a control mechanism that slows down information propagation through the deep transformer architecture. This avoids representational collapse and maintains distinct token representations throughout the network. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/13e61846-3e69-40ab-ae3b-fe32819c813d/image.png?t=1744136334" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Illustration of how attention sinks are usefully leveraged by decoder-only Transformers.</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This perspective helps explain why deeper models and those trained on longer contexts develop stronger sinks, and why this behavior emerges naturally during gradient descent rather than through explicit architectural design. 
The research connects attention sinks to established concepts like rank collapse and over-squashing, offering a unified framework for understanding this previously puzzling but widespread pattern in modern LLMs. </p></td></tr><tr><td id="understanding-attention-sinks-in-ai" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Understanding "Attention Sinks" in AI Language Models</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> LLMs exhibit a pattern called "attention sinks," where they direct a substantial portion of their attention to the first token in a sequence. This new research explains that rather than being inefficient, attention sinks serve as an important control valve that prevents "over-mixing" of information. As text flows through the many layers of an AI model, there's a risk that distinct token representations could blend together too much, causing what experts call "representational collapse." The attention sink effectively slows down this mixing process by redirecting attention away from meaningful interactions. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The researchers show that attention sinks form naturally during the training process, and they emerge gradually as models learn to process text. This phenomenon is stronger in larger models and those trained on longer contexts; for instance, the 405B-parameter version of Llama 3.1 directs nearly 80% of its attention to sinks, compared to just 46% in the 8B-parameter version. Interestingly, the sink always forms at the first position regardless of what specific token appears there, though models perform best when using the same beginning token they were trained with. 
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/538ae9a7-5df1-4a8f-ab57-e342633e5541/image.png?t=1744136420" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The research team verified their theories through both mathematical analysis and empirical experiments. They trained multiple models with different context lengths while keeping the total training tokens constant, confirming that longer-context models develop stronger sinks. They also examined how information propagates through models with and without sinks present, showing that sinks help maintain more distinct token representations throughout the network. 
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/0c498399-9cdd-437d-a204-d02747affd78/image.png?t=1744136448" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="implications-of-attention-sinks-in-" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Implications of "Attention Sinks" in AI Language Models </span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Understanding attention sinks could lead to more efficient model designs that better control information flow without wasting computational resources. This research connects previously disparate concepts like rank collapse, representational collapse, and over-squashing into a unified framework. It provides deeper insights into how transformer-based architectures function at scale. 
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 0px 20px 0px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/bffdd951-139c-4dac-97e0-2ebfd034f991/image.png?t=1744136484" alt="" height="auto" width="600" style="margin: 0px 0 0px;display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The study also shows practical applications by examining the Llama 3.1 family of models, ranging from 8B to 405B parameters. The authors' analysis revealed how attention patterns evolve with scale. We now know that architectural decisions in pre-training directly impact how models form these attention sinks. This research advances our understanding of why certain patterns emerge naturally during AI training, potentially guiding the development of future models that can achieve better performance while maintaining computational efficiency. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92bpNqTKUT5KJufy9N3-_u3hk2JW9vs3cl7yWotXRTGeSLnJYbN4plrH04GjMtrM7Vn8vzslpqjK0xsnKYUIOWEO/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h12/h001.i-cyhhcXuaTpCEkJkJOCvwjS9AqRiVORfbXGDOJ0rXQ" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td class="dd" style="padding: 20px;"><table width="100%" cellpadding="0" cellspacing="0" role="none" style="max-width:520px;margin:0 auto;"><tr><td class="q" style="padding:16px 16px 6px 16px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoDDFT6eh5Nsg0xYVQj-h6I3o9m2k79_qw4izMYhmcI36LqlUmknM1zfFELZiVbpRCGECd-qZ32fDG--hqaQ0JhV_sdQO8zmjZ7gjUalDMDz-sMwW-OA3Ika710quC0AHGPGCQbBCXzAcfBNRibwK1eM/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h13/h001.Elvu_i-yIx9dkrdViJ_NVYEMazMfYqKwjmk3muniLZs" style="text-decoration:none !important;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="100%" style="padding: 0 0 14px 0;text-decoration:none;width:100%;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="36" style="width:36px;"><img 
src="https://pbs.twimg.com/profile_images/1698572487909400576/BvncwnrP_normal.jpg" alt="tw profile: The AI Timeline" style="display:block;width:36px;height:36px;border-radius:50%;border:0;"/></td><td width="400" style="padding:0 0 0 8px;text-decoration:none;"><span style="display:block;font-size:14px;color:#1c2022;font-weight:700;"> The AI Timeline </span><span style="display:block;color:#697882;font-size:14px;"> @TheAITimeline </span></td><td width="24" align="right" style="vertical-align:text-top;"><img width="24" height="24" loading="lazy" alt="tw" style="border:0;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_logo.png"/></td></tr></table></td></tr><tr></tr><tr><td style="word-break:break-word;"><p>🚨This week's top AI/ML research papers:</p><p>- Inference-Time Scaling for Generalist Reward Modeling <br>- Multi-Token Attention <br>- Why do LLMs attend to the first token? <br>- Command A <br>- LLMs Pass the Turing Test <br>- Advances and Challenges in Foundation Agents <br>- PaperBench <br>- Effectively</p></td></tr><tr><td style="padding:12px 0 0 0;"></td></tr><tr><td align="center" style="padding:8px 0 0 0;width:480px;"><img src="https://pbs.twimg.com/media/Gn37ajEWYAA_67H.jpg" alt="Image attached to the post" width="480" height="auto" style="display:block;border:1px solid #E1E8ED;border-radius:5px;width:100%;max-width:480px;height:auto;"/></td></tr><tr><td height="8" style="line-height:1px;font-size:1px;height:8px;"> </td></tr><tr><td align="left" valign="top" class="s"><p>7:08 PM • Apr 6, 2025</p></td></tr><tr><td height="10" style="line-height: 1px; font-size: 1px; height: 10px;"> </td></tr><tr><td height="1" bgcolor="#e1e8ed" style="line-height:0px;font-size:0px;height:1px;"></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td align="left" valign="top" class="s"><p><b style="color:#1C2022">1.09K</b> Likes <b style="color:#1C2022">114</b> Retweets </p></td></tr><tr><td 
align="left" valign="top" class="s"><div align="center" style="text-align:center;margin-top:4px;margin-bottom:4px;padding:8px;border:1px solid #ccd6dd;border-radius:9999px;color:#1B95E0"><b>6 Replies</b></div></td></tr></table></a></td></tr></table></td></tr></table></td></tr></table></td></tr><tr><td align="center" valign="top"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><tr><td class="b" align="center" valign="top" bgcolor="#2a2a2a" style="padding:0px 0px 0px 0px;border-style:solid;border-width: 0px 0px 0px 0px;border-color: #2a2a2a;border-bottom-left-radius:10px;border-bottom-right-radius:10px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" bgcolor="#73ddff" style="padding:12px"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><span style="padding-left:1px;"></span></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCQqcWoV4NNHHr5SkP9THApWuHAAlWLQxI3Q_IqFmt_DcyAxeC8jDApCnHmMSBGpBb5sgtimvBYgxRX-Rp7s0F3LjCHoSwdhr83OBqRFhJ1y_/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h14/h001.ca8wjRqLZ_bXtQxjnM6CYrwUWOZ1BRVHg_Ok_-JtR3I" style="text-decoration:none;"><img width="22" height="22" alt="tw" border="0" style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_dark.png"/></a></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmBoQnQ9VXnB2zTxBG4HeHBgjMqVxpoXRdj01cjwyoVlHgiebEOgBvwHtevoVpsSvpn3Q1di2ml6sb3cBM-X6IStQbj_zQSVGWJ8AAmPw2en2/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h15/h001.N_Hz2Rzvu8d_7uuUXWWvMKoYE6GUfnlWbTa5GwLStzE" style="text-decoration:none;"><img width="22" height="16" alt="yt" border="0" 
style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/youtube_dark.png"/></a></td><td><span style="padding-left:1px;"></span></td></tr></table></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td class="w" align="center" valign="top" style="padding:15px 15px 15px 15px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> Update your email preferences or unsubscribe <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsBhEpz-DJgyVFmavJPa0OyKRRnvw4o7XGyvIv7PRofnmGjG7Dz0cgnJuF_PjzNCGI16DilnkN7pJBrK9S1klAhr91ki4ua8pESWzfktFpVK_-RtbmQ9tYodKiY4FwcBMwP1vzXz8cc6ln3iBjqEJVJC1hr9NBUJRgVHOtzCyT68Uwh1WeV5SPIyPvWrQBXJ0fo1KapAwnsdnTTxBlONnJCROLPs2N1GxTREsb8h2kFElaONN9sqg53WK1hCbSl4BAPI7qPBqDz-ICzoNJODR7sqlquUxNSHDbadw_7yLl3Gf0_Vgy4A4ZRzfXo7Ojw4gPV4TeNztXZTVO9T3QJ1UiljZa0Y1fwY8V0zoxHcZjZlfKylen0r65tCiHLNcLiT5SmUSD-li0-pWGeNjSrUpBLuUiCP06vdUHvvWuYquicKqTbpMa0Yp99ZgnHcCvtstlOs3cTB8x4D5_XG2z-FXGh2iQ6tobmX-SomvzdaB4RjgMtGBb9PIs8MNsFnN9ANO0kqK-4Nt-SlkZIMK9E8wocMUQ889-ZMcfD34UfSumEGmZ04Irw_6EmMwqqFze7glBwsgjDzC80_4kJg-XJpVDWNeecALVKODe3b4xYIXnsadUDWNAfwG3FqKpGa4teVxYBemhk77nK53Em2wbl6VCIctScq9hDh7vroJtfsMZnPlm0NPLF_9D5kqysHz0lvWw2rZoP7Q9VnEdaWaC1kgSak1V4eHf38PTtSpHKHILLzGtn9LRf0RQb2gE6A2cWi0yg/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h16/h001.AKRk35tuItbQwys1BTrQP86WGwG_RDDTinCIU0fgQI8" style="text-decoration:underline;text-decoration-color:#FFFFFF!important;color:#FFFFFF!important;"> here</a></p><p class="copyright" style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> © 2025 bycloudai </p><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> 228 Park Ave S, 
#29976, New York, New York 10003, United States </p></td></tr><tr style="display: table-row !important;"><td align="center" valign="top" style="padding-top:20px;display:table-cell !important;"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="display:table !important;"><tr style="display:table-row !important;"><td class="u" align="center" valign="middle" height="32" style="height:32px;display:table-cell !important; max-height: 32px !important;margin:0px !important; background-color: #ffffff !important;"><a style="line-height:32px !important;text-decoration:none;display:block !important;" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28olDWFpV5DDKfdk_OdOKOj8o-vUCs8NaKyRkCkPyLs-VzQhzXDBRGBCFUdJ0QOvtgUfnoxcahCF-j1Sr-WAtvmpQZcd7WLnoteYcY-V4c3gg_LpUed7YiXCkY1OtQGRzKYDK8xF1tpgF9pohR1-JAtvrr6borx2eajWR_ShdZT7kzIQirGf6CL15shNvhByeep76vWSqKkj1Nss9pogT-QyZLhB2SXrcTG7kHQ22IT8/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h17/h001.GsBIVIYq_YNcurI9Bp-qm3KX2mrmiFXyJ-m8t8oVQj0"><img src="https://media.beehiiv.com/output-onlinepngtools.png" width="16" alt="beehiiv logo" style="display:inline-block !important;max-width:16px !important; vertical-align:-3px !important;width: 16px !important;" border="0"/><span style="padding-left:11px !important;display: inline-block !important;">Powered by beehiiv</span></a></td></tr></table></td></tr><tr><td align="left" valign="top" height="2" style="height:2px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWsHIaP4XNp0WgUYqLvHcKk_3uqk_KIkz4ddLinhFbud6JuxLFdSUhYnR7b1NSsmbtzXNGNblnEEMKUtkCAjkn8Y/4fh/-N3GhbeyQjqJYdVRbqj4NQ/h18/h001.1QgfhLlrGr8kRzIThCH7hvns4ipZyMTaQH8XQFvo6Es" style="color: #2a2a2a !important; cursor: default; font-size: 1px; text-decoration: none;"> Terms of Service </a></td></tr></table></td></tr></table></td></tr></td></tr></table></td></tr></table></td></tr></table></td></tr></table></div></body></html>