<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" style="font-size:16px;"><head></head><head><meta charset="utf-8"/><!--[if !mso]><!--><meta http-equiv="X-UA-Compatible" content="IE=edge"/><!--<![endif]--><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="x-apple-disable-message-reformatting"/><meta name="format-detection" content="telephone=no,address=no,email=no,date=no,url=no"/><meta name="color-scheme" content="light"/><meta name="supported-color-schemes" content="light"/><title>Dynamic Chunking, Small Batch Size Training, and more...</title><!--[if mso]><xml><o:OfficeDocumentSettings><o:AllowPNG/><o:PixelsPerInch>96</o:PixelsPerInch></o:OfficeDocumentSettings></xml><![endif]--><style> :root { color-scheme: light; supported-color-schemes: light; } body { margin: 0; padding: 0; min-width: 100%!important; -ms-text-size-adjust: 100% !important; -webkit-transform: scale(1) !important; -webkit-text-size-adjust: 100% !important; -webkit-font-smoothing: antialiased !important; } .body { word-wrap: normal; word-spacing:normal; } table.mso { width: 100%; border-collapse: collapse; padding: 0; table-layout: fixed; } img { border: 0; outline: none; } table { mso-table-lspace: 0px; mso-table-rspace: 0px; } td, a, span { mso-line-height-rule: exactly; } #root [x-apple-data-detectors=true], a[x-apple-data-detectors=true], #MessageViewBody a { color: inherit !important; text-decoration: inherit !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important; } span.MsoHyperlink { color: inherit !important; mso-style-priority: 99 !important; } span.MsoHyperlinkFollowed { color: inherit !important; mso-style-priority: 99 !important; } .a { background-color:#dedede; } .b { background-color:#2a2a2a; } .c { background-color:#ffffff; } .d { background-color:#fff0c8; } .d2 { background-color:#FFFFFF; } .d3 { background-color:#FFFFFF; } h1 a { text-decoration:none;color:#2C81E5;font-style:italic; } h2 a { text-decoration:none;color:#2C81E5;font-style:italic; } h3 a { text-decoration:none;color:#2C81E5;font-style:italic; } h4 a { text-decoration:none;color:#2C81E5;font-style:italic; } h5 a { text-decoration:none;color:#2C81E5;font-style:italic; } h6 a { text-decoration:none;color:#2C81E5;font-style:italic; } h1, h1 a, h2, h2 a, h3, h3 a, h4, h4 a, h5, h5 a, h6, h6 a, ul, li, ol, p, p a { margin: 0;padding: 0; } h1 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:28px;color:#2A2A2A;line-height:42px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h2 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:24px;color:#2A2A2A;line-height:36px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h3 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:20px;color:#2A2A2A;line-height:30px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h4 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:18px;color:#2A2A2A;line-height:27px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h5 { font-family:'Trebuchet MS','Lucida 
Grande',Tahoma,sans-serif;font-weight:400;font-size:16px;color:#2A2A2A;line-height:24px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h6 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:14px;color:#2A2A2A;line-height:21px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } p { font-family:'Georgia','Times New Roman',serif;font-weight:400;color:#2D2D2D;font-size:16px;line-height:24px;padding-bottom:8px;padding-top:8px;mso-margin-top-alt:8px;mso-margin-bottom-alt:8px; } p a, .e a, ul a, li a, .h a, .h2 a, .h3 a { word-break:break-word;color:#2C81E5 !important;text-decoration:none;font-style:italic; } p a span, .e a span, ul a span, li a span { color: inherit } p .bold { font-weight:bold;color:#2D2D2D; } p span[style*="font-size"] { line-height: 1.6; } .f p { font-size:12px;line-height:15px;color:#2D2D2D;padding:0; } .f p a { color:#2D2D2D !important; } .g p { font-family:'Helvetica',Arial,sans-serif;font-size:14px;line-height:20px;font-weight:normal;margin:0; } .g p a { text-decoration: underline; } .i p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i p a { color:#2D2D2D !important; } .i2 p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i2 p a { color:#2D2D2D !important; } .i3 p { font-family:'Helvetica',Arial,sans-serif;line-height:43px;font-size:24px;color:#2D2D2D; } .i3 p a { color:#2D2D2D !important; } .h p a { color:#595959 !important; } .h2 p a { color:#595959 !important; } .h3 p a { color:#595959 !important; } .f p a, .i p a, .i2 p a, .i3 p a, .h p a, .h2 p a, .h3 p a { text-decoration:underline; } .j { border-top:3px solid #ffeb2d; } .k p { padding-left:15px;padding-bottom:0px;padding-top:6px;mso-margin-top-alt:6px;mso-margin-bottom-alt:0px;mso-margin-left-alt:15px; } .o { background-color:#FFFFFF;border:1px solid #F1F1F1;border-radius:5px; } .o p { font-family:'Helvetica',Arial,sans-serif;padding:0px;margin:0px; } .l p, .l p a, .l a { font-size:14px;line-height:20px;font-weight: bold;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .m p, .m p a { font-size:13px;line-height:18px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .n p, .n p a { font-size:12px;line-height:17px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .p { background-color:#FFFFFF;max-width:520px;border:1px solid #E1E8ED;border:1px solid rgba(80, 80, 80, 0.3);border-radius:5px; } .q { font-size:16px;font-family:Helvetica,Roboto,Calibri,sans-serif !important;border:1px solid #e1e8ed;border:1px solid rgba(80, 80, 80, 0.3);border-radius:10px;background-color:#FFFFFF; } .q p { font-size:16px;font-family:system-ui,Helvetica,Roboto,Calibri,sans-serif !important;color:#222222;padding:4px 0; } .r { border:1px solid #E1E8ED !important;border-radius:5px; } .s p { font-size: 14px; line-height: 17px; font-weight: 400; color: #697882; text-decoration: none; } .t p { font-family:'Helvetica',Arial,sans-serif;font-size:12px;line-height:18px;font-weight:400;color:#000000;font-style:italic;padding:4px 0px 0px; } .v { border-radius:10px;border:solid 0px #DFD150;background-color:#2C81E5;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;color:#FFFFFF; } .v a { text-decoration:none;display:block;color:#FFFFFF; } .w p { 
font-size:12px;line-height:15px;font-weight:400;color:#FFFFFF; } .w p a { text-decoration: underline !important;color:#FFFFFF !important; } ul { font-family:'Helvetica',Arial,sans-serif;margin:0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:disc;font-size:16px; } ul > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:disc; } ol { font-family:'Helvetica',Arial,sans-serif;margin: 0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:decimal;font-size:16px; } ol > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:decimal; } .e h3, .e p, .e span { padding-bottom:0px;padding-top:0px;mso-margin-top-alt:0px;mso-margin-bottom-alt:0px; } .e span, .e li { font-family:'Helvetica',Arial,sans-serif;font-size:16px;color:#2D2D2D;line-height:24px; } .rec { font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji" !important; } .rec__button:hover { background-color: #f9fafb !important; } .copyright a {color: inherit !important; text-decoration: none !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important;} .txt_social p { padding: 0; word-break: break-all; } .table, .table-c, .table-h { border: 1px solid #C0C0C0; } .table-c { padding:5px; background-color:#FFFFFF; } .table-c p { color: #2D2D2D; font-family:'Helvetica',Arial,sans-serif !important;overflow-wrap: break-word; } .table-h { padding:5px; background-color:#F1F1F1; } .table-h p { color: #2A2A2A; font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif !important;overflow-wrap: break-word; } @media only screen and (max-width:667px) { .aa, .w100pc { width: 100% !important; } .bb img { width: 100% !important; height: auto !important; max-width: none !important; } .cc { padding: 0px 8px !important; } .ee { padding-top:10px !important;padding-bottom:10px !important; } .ff ul, .ff ol { margin: 0px 0px 0px 10px !important;padding: 0px !important; } .ff li { margin:10px 0px 0px 10px !important; } .r {height:140px !important;} .s p { font-size:13px !important;line-height:15px !important; } .mob-hide {display:none !important;} .mob-show {display: block !important; width: auto !important; overflow: visible !important; float: none !important; max-height: inherit !important; line-height: inherit !important;} .mob-stack {width:100% !important;display:block !important;} .mob-w-full {width:100% !important;} .mob-block {display:block !important;} .embed-img {padding:0px 0px 12px 0px !important;} .socialShare {padding-top:15px !important;} .rec { padding-left:15px!important;padding-right:15px!important; } .bodyWrapper { padding:7px 4px 7px 4px !important; } .social-mobile {float:left !important;margin-top:10px !important;} } @media screen and (max-width: 480px) { u + .a .gg { width: 100% !important; width: 100vw !important; } .tok-heart { padding-top:75% !important; } .tok-play { padding-top: 250px !important; } } @media screen and (max-width: 320px) { .tok-heart { padding-top:65% !important; } } .u { border: 1px solid #CACACA !important; border-radius: 2px !important; background-color: #ffffff !important; padding: 0px 13px 0px 13px !important; 
font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif !important;font-size: 12px !important; color: #767676 !important; } .u a { text-decoration: none; display: block !important; color: #767676 !important; margin: 0px !important; } .u span, .u img { color: #767676 !important;margin:0px !important; max-height:32px !important;background-color:#ffffff !important; } </style><!--[if mso]><style type="text/css"> h1, h2, h3, h4, h5, h6 {font-family: Arial, sans-serif !important;} body, table, td, p, a, span {font-family: Arial, sans-serif !important;} sup { font-size: 100% !important;vertical-align: .5em !important;mso-text-raise: -1.5% !important;line-height: 0 !important; } ul { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ul li { margin-left: 0px !important; mso-special-format: decimal; } ol { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ol li { margin-left: 0px !important; mso-special-format: decimal; } li.listItem { margin-left:15px !important; margin-top:0px !important; } .paddingDesktop { padding: 10px 0 !important; } .edm_outlooklist { margin-left: -20px !important; } .embedImage { display:none !important; } </style><![endif]--><style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-display: swap; src: url('https://fonts.gstatic.com/s/opensans/v40/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4gaVIUwaEQbjA.woff2') format('woff2'); } @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 700; font-display: swap; src: url('https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@1,700&display=swap') format('woff2'); } </style></head><body class="a" style="margin:0px auto;padding:0px;word-wrap:normal;word-spacing:normal;background-color:#dedede;"><div role="article" aria-roledescription="email" aria-label="email_name" lang="en" style="font-size:1rem"><div style="display:none;max-height:0px;overflow:hidden;"> Dive into the latest AI research and industry news, featuring Pollen Robotics' Reachy Mini, Grok 4's launch, and groundbreaking developments in open-source robotics and AI technologies.  
‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ </div><table role="none" width="100%" border="0" cellspacing="0" align="center" cellpadding="0" class="gg"><tr><td align="center" valign="top"><table role="none" width="670" border="0" cellspacing="0" cellpadding="0" class="aa" style="width:670px;table-layout:fixed;"><tr><td class="bodyWrapper" align="center" valign="top" style="padding:7px 7px 7px 7px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="border-width:0px 0px 0px 0px;border-style: solid; border-color: #2a2a2a;border-radius:10px 10px 0px 0px;background-color:#ffffff;" class="c"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr id="header"><td style="padding:28px 28px 0px 28px;"><div style="padding-top:0px;padding-right:0px;padding-bottom:20px;padding-left:0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td class="f" align="right" valign="top"><p> July 16, 2025 | <a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxSdB5RCIH6yy1Fm1CYma3ExDHB1wNy-aAPOlXvcBBYN2oh4VRWjTwHhMSwJpGVTRdxPDcx-jUwWXbOqs_SCFrArw7Ke8rgHDr5A_Mt94Glmo5yPwFYECV9cogC9XOO9gktf8TcR2d8WCI5rJwa1EiR_TaueM9Tqw-Az-LjjEYyQmkvl4uRd8QzjTSPFagyS8CYnwbiiwspd6Ujk-Yk3IrVRCMzp1yKhbBRUTAr2L5CbMrofJH-2_OiFxrVQS96R0CcFXsLFq4bhV6J4BLTt-sY9fGiJc-2FP-m_NsNW5okKGlFHZ3YnO5egtMCyopZZyqh17EyfDqK7ia3YOgs-dNLdexSrihh2QW7YN7L3cdK7_J7VRAt4xRjMfWwjRGXccty_bPS_z_GQry2bjJSmr7h3AiSxnL3Gg-tKQaaLa-TaQ8jeXrFHbDr4nnUTJr0BuXX_CxhlS2Fwyi4TCPKIqy1h-26FkA7nvoqkr-wEOyVHIf0DXKyDVfE_sGeHcyNz9eOEexDmhmTAdU0cMOWn1RL2KCfrb0yS9rFEIpvU8eNiCby_10Oj1v7WAJ296TJCGzYZEPyPY1oBj65ESCwwWCPoPSCJLSwYqhPARXThEqr1wcfcKANhEufRdPuA8mMkKIS1SmiTAOesSGG3UnL6-KNLR5P9bQ4nw3rPgteIwDtPvkSOUSMS0LkvB3Wpj4XyNAA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h0/h001.UHTG6-pW2RP2ckuE6xqOoEQ1Wimm4NawoeXFLEyjiPY">Read Online</a></p></td></tr><tr><td class="dd" align="center" valign="top" style="padding:15px 0;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><h1 style="text-align:left;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;font-weight:Bold;font-size:32px;color:#2A2A2A;padding:2px 0;line-height:38px;"> Dynamic Chunking, Small Batch Size Training, and more... </h1><p style="text-align:left;font-family:'Helvetica',Arial,sans-serif;font-weight:normal;font-size:20px;color:#3E3E3E;padding:5px 0;line-height:24px;"> Dive into the latest AI research and industry news, featuring Pollen Robotics' Reachy Mini, Grok 4's launch, and groundbreaking developments in open-source robotics and AI technologies. 
</p></td></tr></table></td></tr><tr><td style="height:0px;width:0px;"><div style="height:1px;" data-open-tracking="true"> <img src="https://elink4f7.mail.bycloud.ai/ss/o/u001.3wmUuY8gEWd4_869a_eXcg/4i8/qagMnS_rQxW_F_M9Wl6Jrw/ho.gif" alt="" width="1" height="1" border="0" style="height:1px !important;width:1px !important;border-width:0 !important;margin-top:0 !important;margin-bottom:0 !important;margin-right:0 !important;margin-left:0 !important;padding-top:0 !important;padding-bottom:0 !important;padding-right:0 !important;padding-left:0 !important;"/> </div></td></tr></table></div></td></tr><tr id="content-blocks"><td class="email-card-body" align="center" valign="top" style="padding-bottom:28px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td id="nov-18-th-nov-24-th-33-latest-ai-re" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h6 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:87.5%;"><i>July 7th ~ July 13th</i><br><i>#64 Latest AI Research Explained Simply</i></h6></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="industry-news-in-1-line" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;">🗞️ Industry News in 1 Line</h2></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 5.2k</span></span> The Pollen Robotics team at Hugging Face has <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWtgZp_0kUt97ATPOljjaBHbfM4X_nVcYYw2MGzQH_msUiqVJo5uPyLtZBhm_wBsyrFjNMMy5dyiWhbE6JOXWNnjRCKeIrRXJGl6XFqAR4ZOi/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h1/h001.FvQNe0zzYJZ1dHsl7PORjENKqBRV-h77DZSMD2Awq5Q" target="_blank" rel="noopener noreferrer nofollow"><span>introduced Reachy Mini, an open-source desktop robot</span></a> designed for creative coding, AI experimentation, and human-robot interaction. </p><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"> The robot encourages community engagement by supporting Python programming, multimodal sensing, and integration with Hugging Face models, making it accessible for developers, educators, and enthusiasts worldwide. 
</p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:524px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/3db1917d-4e1d-419b-83f4-9c5c974bc66c/uEa13KsL5wtQREVZ1ixwc.png?t=1752594107" alt="" height="auto" width="524" style="display:block;width:100%;" border="0"/></td></tr></table></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 30k</span></span> Elon Musk had a pretty busy week as <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCcCeRaaWB8FVX5uoo6YyYJM-Pf9eOy9np5rVxth-1uzEr2m-O50Mz3K4xhHxJ5DnsaS5Xm3R44YZOClh_GcEeKUYhXEDnJYvkyrWEnD4h3n_BWydd-uXaVHDWB1aSR3ZUA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h2/h001.g-pPbGh6pW6PEwWXie1Nbo88NMFmhwZKsyT_2_QrZP0" target="_blank" rel="noopener noreferrer nofollow"><span>xAI launched Grok 4</span></a>, a new iteration of its chatbot, just days after the tool sparked controversy with offensive messages. Elon also announced a $200 million <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j281rnMGC7GbNZiHXD10rIEzdKoDIlfhN9xfr4FZNHY2HYzw8zUw8qLsvF18Sqlth45z-nDT8GtF1qH3-cStd_b6O1XzOulaxkXsk5tIN1RuGlSpH-CURNJ9rhj-XS-RNWaZdPcXcSE6tfbUl5J3HkjSkuAEyXfOfPU2wSYN4j_-tAd0q7XevGG_6Wq_Ewx8ClXyBN63h26ZmJPt2nYvxda4/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h3/h001.7KD9hhu254TgFAbmSegiV-RC9GFnaTy6p-j_bzmojtA" target="_blank" rel="noopener noreferrer nofollow"><span>U.S. Department of Defense contract to provide AI services through its Grok</span></a> for Government program, as part of a broader federal initiative to accelerate national security AI adoption. </p></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 6.5k</span></span> <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxSGWTNhdnM83-a93AHZ1JsgEav8L5dtqpSzDQ5sO3_YeUO_huGADrzUl74HvQoq1S6CMTZnKFEkcQvvyDGblZvdkLhNFHsO1IvEA2teVoAIm/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h4/h001.2QcJrK0AUvbbLhBAWeZOZ2PIrFvVYINGExWYqvFOldE" target="_blank" rel="noopener noreferrer nofollow"><span><b>Kimi K2 is a new Mixture-of-Experts model</b></span></a> with 32 billion active parameters and 1 trillion total parameters. It has state-of-the-art capabilities in knowledge, math, and code, along with reflex-grade agentic performance. 
</p><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"> Developers and researchers can now <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.zNfxTwpJFmrsCuJJphGRkO3UtkflFAGe5v9gWjga7B7ALfCm-HzzOXvsPXT_nYqz3hZX6lUAvRMxydfRtkB1lihb7l3ZOueQLgjKZqoaxF_aKFKAi5v3hHUaHNlCkx7j8smhiqoFMaW72DzdlzlSooPl2jF9Tnv4meeZuypIs60/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h5/h001.hamdiLmY6FOmFIFttfzh8H_w60TQSsu12RhKtLJQKyY" target="_blank" rel="noopener noreferrer nofollow"><span>access Kimi-K2 via API</span></a> or <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWpwuXiC8bRxIIctJGlFDhSexQTN94gQ9-xL5dFIvuVomZYLgMFxnMEPJHdJ8hpk-oe1aa_0ReFuqJYAHa1D9fGICrj06nLksABlx_-woSXYa/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h6/h001.IigdTwoyghmGiaMejeC0HldaSDje5GUVysycW5_cXyo" target="_blank" rel="noopener noreferrer nofollow"><span>download Kimi-K2-Instruct weights from HuggingFace</span></a>. </p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:524px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/a8514a15-2f9a-4d8a-ba2f-126c47f66581/image.png?t=1752603404" alt="" height="auto" width="524" style="display:block;width:100%;" border="0"/></td></tr></table></li></ol></div></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="transparent" style="background-color:transparent;border-color:#2C81E5;border-style:solid;border-width:5px;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;"><span style="">Support My Newsletter</span></h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style="color:rgb(34, 34, 34);font-family:Georgia, "Times New Roman", serif;font-size:16px;">As I aim to keep this newsletter free forever, your support means a lot. 
If you like reading The AI Timeline, consider forwarding it to another research enthusiast, It helps us keep this up for free!</span></p></td></tr><tr><td align="center" valign="top"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="font-size:0px;line-height:0px;padding:30px 0px 30px;" class="dd"><table class="j" role="none" width="50%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td> </td></tr></table></td></tr><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;">Share The AI Timeline</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> You currently have <strong>0</strong> referrals. </p></td></tr><tr><td align="left" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; display:none;width:0px;max-height:0px;overflow:hidden;mso-hide:all;height:0;font-size:0;max-height:0;line-height:0;margin:0 auto;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 0;"><tr><td align="center" valign="top" style="width:328px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsGNUqyW5TiZkyMsF1yreu0byy2KW36J1wDdpoLuXg2TU1F1OW8OHoHaU4-ZmrZpPU4RN-crQCEimD190CSn9fPuxpIRojBJyu1VfV5KtQD3QMVdSg2JrjEj5-xm4r4E12Whf08itqPCb9Q5W0X4rt3ubYkqCmWnLeZpmb3_RZcbIk0UE5wZnFLCQJHLFs0qZ0OGpXp89o1HU4mWIBur5Or4tQGm5M_Y8m5PvTEfYfxLRyrcRv7GyVs5oLtFfiySZ2SqtZypLA-h50h61p0uPiA7iA_PiMqlVLtM-87XL33VZi05_O3UTpWE_0nAzFRJ4TW1ayz3_vn4Zlp9IERdbnnAd_1kPLD4lAQcR5PRXgtpCEt1vpnfntroCObuODCHKypeou0x3P4dnGYlWNtKGqk__lYTvm4BaORwEWIbfLIieTdhfr4EPzh1J_85OkgJVMVNQFhH2hlDaJ1n0FirBvcUfeLt3fUqxEiDAweOLBeahOK65E6KASmcPGoT4xcJEi9qTnNEkMXJQbXEXEXF8Szcu8zG8UpKOcni7ndTXHHLA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h7/h001.mCKNJQSV5_8dlADVRgC1zQGk4XnCyhHMbhC0wT_3Qi4" rel="noopener noreferrer nofollow" style="text-decoration:none;" target="_blank"><img src="" alt="" height="auto" width="328" style="display:block;width:100%;" border="0"/></a></td></tr></table></td></tr><tr><td align="left" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:left;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="left" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsGNUqyW5TiZkyMsF1yreu0byy2KW36J1wDdpoLuXg2TU1F1OW8OHoHaU4-ZmrZpPU4RN-crQCEimD190CSn9fPuxpIRojBJyu1VfV5KtQD3QMVdSg2JrjEj5-xm4r4E12Whf08itqPCb9Q5W0X4rt3ubYkqCmWnLeZpmb3_RZcbIk0UE5wZnFLCQJHLFs0qZ0OGpXp89o1HU4mWIBur5Or4tQGm5M_Y8m5PvTEfYfxLRyrcRv7GyVs5oLtFfiySZ2SqtZypLA-h50h61p0uPiA7iA_PiMqlVLtM-87XL33VZi05_O3UTpWE_0nAzFRJ4TW1ayz3_vn4Zlp9IERdbnnAd_1kPLD4lAQcR5PRXgtpCEt1vpnfntroCObuODCHKypeou0x3P4dnGYlWNtKGqk__lYTvm4BaORwEWIbfLIieTdhfr4EPzh1J_85OkgJVMVNQFhH2hlDaJ1n0FirBvcUfeLt3fUqxEiDAweOLBeahOK65E6KASmcPGoT4xcJEi9qTnNEkMXJQbXEXEXF8Szcu8zG8UpKOcni7ndTXHHLA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h8/h001.3GIT0DrDXKj6EN6kC-cKlwxjwYrr2c-KNukD8lTFQBo" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Click to Share </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Or copy and paste this link to others: <a class="link" href="https://mail.bycloud.ai/subscribe?ref=6SqUHb8KiF&_bhlid=bf7a73b936aab597b0df9777ef50b28c5a049d32" target="_blank" rel="noopener noreferrer nofollow" clicktracking="off"><span>https://mail.bycloud.ai/subscribe?ref=6SqUHb8KiF</span></a></p></td></tr><tr><td align="center" valign="top" style="font-size:0px;line-height:0px;padding:30px 0px 30px;" class="dd"><table class="j" role="none" width="50%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td> </td></tr></table></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.zNfxTwpJFmrsCuJJphGRkKSrCVph9-fOYkcjx4VfJRwtQQsKrZC8pi-PiKai2fq4lAto9WepTJo69aQJ1T73b1BYaJHeCrLz1cWpFYfpKjdJ071BkzwRo9IrCS5YAIxy/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h9/h001.W43PEwP00IRpAKDB4vyBdTihUtiv47DhTKgsnVBZsDk" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Check Out My Patreon </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoGymQ3NNPtd5dE5MV_8UgjIDFPVXngz8pvQBldSW42yhUe_Qiq6DgEPMEBuPL9yfRpXelTiuu2kS8pLFvsoem_XoZoy_n13sTKUhZIbl0VH6/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h10/h001.zwZAFFvADHxU1WMLh6xsN5d5JOj1Njes0aJBa7pi0yY" target="_blank" rel="noopener noreferrer nofollow"><span>Advertise with The AI Timeline! 
</span></a></span></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="dynamic-chunking-for-endto-end-hier" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;">Dynamic Chunking for End-to-End Hierarchical Sequence Modeling</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style=""><i>Hwang et al. [Carnegie Mellon University, Cartesia AI]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 1.1k </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Chunking </span></span></p></td></tr><tr class="embed-gen-img-r"><td align="center" valign="top" style="padding:12px 12px 12px 12px;" class="dd"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" class="o" style="padding:12px 12px 12px 12px;;background-color:#FFFFFF;border-color:#F1F1F1;border-radius:5px 5px 5px 5px;border-width:1px 1px 1px 1px;"><!--[if !mso]><!--><div style="display:none; float:left; overflow:hidden; width:0; max-height:0; line-height:0;" class="mob-show"><table role="none" border="0" cellspacing="0" cellpadding="0" align="right" width="100%"><tr><td align="center" valign="top"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJRchKnjrXjcBO0EkVN1P_oXMpEpwm9KoGwyiyU4qU-HQ2P_oOTQjWiwI95tVF_7yyqmntDA0h6whCt8N0F010cnxL5C58-sWXdtfBa9iLdPN/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h11/h001.inqInPnnrXzJJdsyH-5e7K5sSW04Ily-ia9WHTTsjDg" target="_blank"><img src="https://opengraph.githubassets.com/e96ab7041f47a25db201d682de2da2d83e3ac857127acfbfceb68ebf5d19ac6a/goombalab/hnet" width="100%" style="height:auto;display:block;"/></a></td></tr><tr><td height="16" style="font-size:16px;line-height:16px;"> </td></tr></table></div><!--<![endif]--><table role="none" border="0" cellspacing="0" cellpadding="0" align="right" width="100%"><tr><td width="57%" align="center" valign="middle" class="mob-stack"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="left" valign="middle" class="l"><p><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJRchKnjrXjcBO0EkVN1P_oXMpEpwm9KoGwyiyU4qU-HQtJuJYx5zUppgQEXkBWXgiz8xENIxiGaF1jy-b55OVl2Ya8kriko3r6COPrpP5ILj/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h12/h001.J-xIrfvsr71ldAxDyavLvNMV0SKqhO7cm9EBU3V3cUs" style="text-decoration:none;font-style:normal;color:#2D2D2D !important;font-size:14px;line-height:20px;" target="_blank"> GitHub - goombalab/hnet: H-Net: Hierarchical Network with Dynamic Chunking <tr><td align="left" valign="top" class="m"><p 
style="font-size:13px;line-height:19px;color:#2D2D2D;"> H-Net: Hierarchical Network with Dynamic Chunking. Contribute to goombalab/hnet development by creating an account on GitHub. </p></td></tr><tr><td align="left" valign="bottom" class="n" style="vertical-align:bottom;padding-top:12px;"><p style="word-break:break-word;">github.com/goombalab/hnet</p></td></tr></a></p></td></tr></table></td><td width="3%" style="font-size:16px;line-height:16px;" class="mob-hide"> </td><td width="40%" align="left" valign="top" class="mob-hide"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJRchKnjrXjcBO0EkVN1P_oXMpEpwm9KoGwyiyU4qU-HQwLdYBz1YmyKgVzGN7jAhn1hoE30tbCSAxLSnU-mNpRPO6bWLAYJQxL-XcmeJU9QA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h13/h001.q6Ye9FyZWN2Je4GM7Q49F4YFiEblx-bU8sBfOoLqdw8" target="_blank"><img src="https://opengraph.githubassets.com/e96ab7041f47a25db201d682de2da2d83e3ac857127acfbfceb68ebf5d19ac6a/goombalab/hnet" width="242" style="height:auto;display:block;"/></a></td></tr></table></td></tr></table></td></tr><tr><td id="introduction-to-endto-end-sequence-" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Introduction to End-to-End Sequence Modeling Without Tokenization</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Language models rely on tokenization, which is a handcrafted preprocessing step that converts raw text into predefined chunks. This tokenization step creates barriers for true end-to-end learning as it limits character-level understanding and struggles with languages lacking clear segmentation cues. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> This paper introduces the H-Net architecture which addresses this issue by introducing <span style="font-weight:700;"><b>dynamic chunking</b></span>, a method that learns to segment raw data (like bytes) into meaningful units during training. This replaces the traditional tokenization-LM-detokenization pipeline with a single hierarchical model, enabling more robust and efficient learning directly from unprocessed inputs. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/b03a7ca6-b91d-4117-84ac-827bdd09534c/image.png?t=1752595943" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:656px; padding: 4px 0px 4px 0px;"><p>Architectural overview of H-Net with a two-stage hierarchical design (𝑆=2)</p></td></tr></table></td></tr><tr><td id="inner-workings-of-h-net" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Inner Workings of H-Net</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> H-Net processes data through a U-Net-like hierarchy with three core components. First, a lightweight <span style="font-weight:700;"><b>encoder network</b></span> handles fine-grained details from raw inputs. Second, a <span style="font-weight:700;"><b>main network</b></span> operates on compressed representations, resembling tokens but learned dynamically. Third, a <span style="font-weight:700;"><b>decoder network</b></span> reconstructs the original sequence resolution. The key innovation is the <span style="font-weight:700;"><b>dynamic chunking mechanism</b></span> between these stages. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> A <span style="font-weight:700;"><b>routing module</b></span> predicts boundaries between adjacent elements using cosine similarity: when consecutive vectors differ significantly (e.g., at word breaks), it flags a boundary. This replaces fixed heuristics with context-aware decisions. A <span style="font-weight:700;"><b>downsampler</b></span> then compresses sequences by retaining only boundary-marked vectors, shortening the sequence for efficient processing in the main network. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/2a4f6076-426c-4d52-84f7-7d72b99e5e98/image.png?t=1752596060" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:656px; padding: 4px 0px 4px 0px;"><p>Comparison of decompression strategies on the example sequence</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> To overcome training instability from discrete boundary decisions, a <span style="font-weight:700;"><b>smoothing module</b></span> interpolates uncertain chunks using weighted combinations of neighboring vectors. 
This maintains gradient flow during backpropagation. Additionally, a <span style="font-weight:700;"><b>ratio loss</b></span> function ensures balanced compression, preventing trivial solutions like retaining all inputs or over-compressing. </p></td></tr><tr><td id="evaluation-and-impact-of-h-net" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Evaluation and Impact of H-Net</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> H-Net <span style="font-weight:700;"><b>outperforms token-based models</b></span> across multiple benchmarks when matched for compute. On English text, a single-stage byte-level H-Net surpasses BPE-tokenized Transformers in perplexity and downstream tasks. With two hierarchical stages, it matches the performance of Transformers twice its size after just 30B training bytes. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/2bd9bcd6-1584-4cbf-9b45-c659dbd86cda/image.png?t=1752596115" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:656px; padding: 4px 0px 4px 0px;"><p>Architectures for main language models, all data-/FLOP-matched.</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> The model shows significant gains on noisy text benchmarks and languages with weak tokenization heuristics. For Chinese and code, it improves accuracy by 6.4 points on XWinograd-zh; for DNA sequences, it achieves nearly <span style="font-weight:700;"><b>4× better data efficiency</b></span>. Learned boundaries align with linguistic units (e.g., morphemes), validating its ability to discover structure without supervision. 
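</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> To make the chunking mechanism more concrete, here is a minimal NumPy sketch of the idea described above: boundary probabilities come from the cosine similarity between neighboring vectors, and the downsampler keeps only the boundary-marked positions. The function name, shapes, and threshold are illustrative assumptions, not the authors' implementation. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><pre style="font-family:Consolas,Monaco,'Courier New',monospace;font-size:13px;line-height:18px;color:#2D2D2D;background-color:#F6F6F6;border:1px solid #E1E8ED;border-radius:5px;padding:12px;overflow-x:auto;white-space:pre;">
import numpy as np

def dynamic_chunk(hidden, threshold=0.5):
    """Toy H-Net-style dynamic chunking (illustrative only).

    hidden: (seq_len, dim) encoder outputs for a raw byte sequence.
    Returns boundary probabilities and the shortened sequence that the
    main network would operate on.
    """
    # Cosine similarity between each vector and the one before it.
    normed = hidden / np.linalg.norm(hidden, axis=-1, keepdims=True)
    sim = (normed[1:] * normed[:-1]).sum(axis=-1)

    # Dissimilar neighbors (e.g. at a word break) mean a high boundary
    # probability; the first position is always treated as a boundary.
    prob = np.concatenate([[1.0], 0.5 * (1.0 - sim)])

    # Downsampler: keep only boundary-marked vectors for the main network.
    keep = prob >= threshold
    return prob, hidden[keep]

rng = np.random.default_rng(0)
states = rng.normal(size=(16, 8))      # stand-in for encoder outputs
prob, chunks = dynamic_chunk(states)
print(prob.round(2), chunks.shape)
</pre></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> In the full H-Net, these boundary scores also drive the smoothing module and ratio loss described above; the sketch only covers the boundary-scoring and downsampling steps.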
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/dda2fb5c-99bd-4c80-a7f2-9f2500b9cbb2/image.png?t=1752596146" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92aeme_nNrK6RBW0y6_8N0HwgjLKZ07qc1EUvEbSUO7-cktOEsd6bbBN0Lt2J-vGd0o-gDhWvWjpISRscVcOk1YT/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h14/h001.zsZCNQFkO_svM1eogw2aeW3OFjsz2dAPVcx2SbBD66c" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="energy-based-transformers-are-scala" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;">Energy-Based Transformers are Scalable Learners and Thinkers</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style=""><i>Gladstone et al. 
[UVA, UIUC, Amazon GenAI, Stanford University, Harvard University]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 3.9k </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> Transformers </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr class="embed-gen-text"><td align="center" valign="top" style="padding:12px 12px 12px 12px;" class="dd"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" class="o" style="padding:12px 12px 12px 12px;;background-color:#FFFFFF;border-color:#F1F1F1;border-radius:5px 5px 5px 5px;border-width:1px 1px 1px 1px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="left" valign="top" class="l"><p><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.S3-S-66rObX2TUuSZjz2bizArelgWRdP0jVCah35eRkixc0ueCs4O7pNjQKulUI_SEuuiJUz0VBSApI9cuW2Nm3aeIBSTdLlTUuLq3tNl0d8l3ylRJ0XQbbZpbKmkJAvpTtRGjefeSNXGhcZPamr9A/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h15/h001.paPxZKHpZiapdrEyAVCGvJh579wZ8fKg9VbnbicFYfA" style="text-decoration:none;font-style:normal;color:#2D2D2D !important;font-size:14px;line-height:20px;" target="_blank"> Energy-Based Transformers: Outscaling Transformers and Generalizable Reasoning <tr><td align="left" valign="top" class="m"><p style="font-size:13px;line-height:19px;color:#2D2D2D;"> Learn how Energy-Based Transformers (EBTs) enable improved scalability over traditional transformers while generalizing reasoning/thinking capabilities to be learned on any problem. #AI #DeepLearning #EBMs #Transformers #reasoning #system 2 thinking </p></td></tr><tr><td align="left" valign="bottom" class="n" style="vertical-align:bottom;padding-top:12px;"><p style="word-break:break-word;">energy-based-transformers.github.io</p></td></tr></a></p></td></tr></table></td></tr></table></td></tr><tr><td id="introduction-to-energy-based-transf" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Introduction to Energy-Based Transformers</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Current AI models excel at quick, intuitive tasks but struggle with complex problems that require deeper, deliberate reasoning, often called System 2 Thinking. Existing approaches to this kind of reasoning are typically confined to specific domains like math or coding, need extra supervision, or can't adapt computation dynamically. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/42a98eb6-1be7-4fc1-a6df-bd06e4d0b271/ezgif.com-animated-gif-maker_text.gif?t=1752596459" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> This paper introduces energy-Based Transformers (EBTs) to tackle this by learning entirely from unsupervised data. Instead of generating answers directly, EBTs train a verifier that scores input-prediction compatibility. Predictions start random and refine iteratively via energy minimization, which enables flexible thinking across text, images, and other modalities. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/eb5dea59-6d8e-46b8-a46a-867d12802780/ezgif.com-animated-gif-maker_video.gif?t=1752596429" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:656px; padding: 4px 0px 4px 0px;"><p>Thinking Processes visualized as energy minimization for autoregressive EBTs.</p></td></tr></table></td></tr><tr><td id="inner-working-of-energy-based-trans" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Inner working of Energy-Based Transformers</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> EBTs assign an energy value to input-prediction pairs, where lower energy means higher compatibility. During training, they start with random predictions and iteratively adjust them using gradient descent to minimize energy. This process mimics human deliberation: harder problems require more refinement steps (dynamic compute), while the energy score itself signals prediction confidence (uncertainty modeling) and correctness (verification). Three regularization techniques ensure stable learning: a replay buffer stores past optimization paths, Langevin dynamics add noise to explore solutions, and randomized step sizes prevent overfitting. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/daf5add1-0570-4730-86e0-13240ece60c0/image.png?t=1752596500" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:656px; padding: 4px 0px 4px 0px;"><p>Autoregressive Architecture Comparison.</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Unlike autoregressive models that fix computation per token, EBTs adjust effort per prediction. For example, predicting common words like "the" converges quickly, while niche terms like "fox" need more steps. The architecture uses transformer blocks for parallel processing, with decoder-only (for text) and bidirectional variants (for images). Crucially, EBTs unify verification and generation in one model, avoiding adversarial training. </p></td></tr><tr><td id="evaluation-and-results-of-energy-ba" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Evaluation and results of Energy-Based Transformers</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> EBTs outperform traditional transformers and diffusion models across key benchmarks. During pretraining, they <span style="font-weight:700;"><b>scale 35% faster</b></span> in data efficiency, batch size, and model depth. In language tasks, extra computation ("thinking longer") improved perplexity by 29% more than transformers, while self-verification (choosing the best prediction) boosted gains by 10–14%. For image denoising, EBTs surpassed diffusion transformers using <span style="font-weight:700;"><b>99% fewer forward passes</b></span>. Most importantly, performance improvements were largest on out-of-distribution data, which highlights better generalization. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/1bdfd876-bcf6-40b8-917b-4495e050dfcc/image.png?t=1752596769" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> It performs well on many benchmarks but it has a few limitations including training instability from high-dimensional energy landscapes. However, using regularization techniques can mitigate this to some extent. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92Zsu28wPKZv--oZ--UZyuwKueZ_Ma8CWkklTg6NtU1SVCdcpnVgdkvgf8Re6J6mPAN_DniG8uOrj1GTuT2iuo0x/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h16/h001.G755iIBQJhZQKJW0lwK1YpHLdTWHCe2KQWJxVwaFfzs" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="small-batch-size-training-for-langu" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;mso-line-height-alt:150.0%;">Small Batch Size Training for Language Models: When Vanilla SGD Works, and Why Gradient Accumulation Is Wasteful</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style=""><i>Marek et al. 
[New York University, Columbia University]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 730 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Training </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> bycloud’s pick </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr class="embed-gen-img-r"><td align="center" valign="top" style="padding:12px 12px 12px 12px;" class="dd"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" class="o" style="padding:12px 12px 12px 12px;;background-color:#FFFFFF;border-color:#F1F1F1;border-radius:5px 5px 5px 5px;border-width:1px 1px 1px 1px;"><!--[if !mso]><!--><div style="display:none; float:left; overflow:hidden; width:0; max-height:0; line-height:0;" class="mob-show"><table role="none" border="0" cellspacing="0" cellpadding="0" align="right" width="100%"><tr><td align="center" valign="top"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJY6LbNwJdHoiXsHB7mlBXabGfZDPsfgAXJOnwWRCeeMQ9EZkFZAC92XrEkEcSp5FeQp3wpaDimSNw9H4iSOzeG4AH8T-_Jjisk7DoXtQKr27AGqdaAprQmYew5qLKKVrIA/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h17/h001.XMKEa7VGqETOTErz8whHhSY8Rqw-HINTCgVZAc885wk" target="_blank"><img src="https://opengraph.githubassets.com/fe0cb70538ec01dcf4b0584ce190073ec7f2110875b14c161c9d8769d32cc780/martin-marek/batch-size" width="100%" style="height:auto;display:block;"/></a></td></tr><tr><td height="16" style="font-size:16px;line-height:16px;"> </td></tr></table></div><!--<![endif]--><table role="none" border="0" cellspacing="0" cellpadding="0" align="right" width="100%"><tr><td width="57%" align="center" valign="middle" class="mob-stack"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="left" valign="middle" class="l"><p><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJY6LbNwJdHoiXsHB7mlBXabGfZDPsfgAXJOnwWRCeeMQu39UmGlgxQf-63choV70cUXvVcEDlFYqww953gc4ly-2w1S2EXbaBe_FOj6s5fQ9Z-lZnj0v1aYCcfDDm2UPvQ/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h18/h001.jd5e9zLgshOhE4bISNm7FF9Gh2FLHIf_JuEj1y6T254" style="text-decoration:none;font-style:normal;color:#2D2D2D !important;font-size:14px;line-height:20px;" target="_blank"> GitHub - martin-marek/batch-size: 📄Small Batch Size Training for Language Models <tr><td align="left" valign="top" class="m"><p style="font-size:13px;line-height:19px;color:#2D2D2D;"> 📄Small Batch Size Training for Language Models. Contribute to martin-marek/batch-size development by creating an account on GitHub. 
</p></td></tr><tr><td align="left" valign="bottom" class="n" style="vertical-align:bottom;padding-top:12px;"><p style="word-break:break-word;">github.com/martin-marek/batch-size</p></td></tr></a></p></td></tr></table></td><td width="3%" style="font-size:16px;line-height:16px;" class="mob-hide"> </td><td width="40%" align="left" valign="top" class="mob-hide"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.VomAAYwkCjux8i_FMc4kJY6LbNwJdHoiXsHB7mlBXabGfZDPsfgAXJOnwWRCeeMQqKJfKQuZCHEGUob6zW9S5arKP9feT2-UckgV0cPp8mC8d4gdgZJ8O_bmEmeZq6VdoU4Z3k6KcaJxEFkeJt1t8A/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h19/h001.6Yymi5OnJtJGG7QiBqsE4wqhrInKI7BDt-EUZ8Cu7Zo" target="_blank"><img src="https://opengraph.githubassets.com/fe0cb70538ec01dcf4b0584ce190073ec7f2110875b14c161c9d8769d32cc780/martin-marek/batch-size" width="242" style="height:auto;display:block;"/></a></td></tr></table></td></tr></table></td></tr><tr><td id="introduction-to-small-batch-trainin" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Introduction to Small Batch Training for Language Models</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Language model training often relies on large batch sizes for stability, and researchers frequently use techniques like gradient accumulation to simulate even larger batches. This approach requires complex optimizers like Adam and consumes significant memory. But what if we could achieve better results with tiny batches, even as small as one example at a time? </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> The researchers of this paper have overturned this conventional wisdom, by showing that small batches not only train stably but also outperform larger batches in robustness and efficiency when hyperparameters are scaled correctly. </p></td></tr><tr><td id="inner-workings-of-small-batch-optim" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Inner Workings of Small Batch Optimization</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Small batch sizes work surprisingly well because they avoid the pitfalls of large-step updates. When using large batches, optimizers must predict loss surfaces far from current parameters, requiring complex tuning. Smaller batches take gentler steps, which reduces the need for momentum or adaptive methods. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> For Adam optimizers, the biggest innovation was scaling the second-moment decay rate (β₂) based on token exposure. Instead of fixing β₂ across batch sizes, the paper proposes preserving its "half-life", the number of tokens needed to halve a gradient’s influence. This means adjusting β₂ as batch size changes: for example, when reducing batch size from 512 to 1, β₂ must increase dramatically (e.g., from 0.95 to 0.9999) to maintain consistent averaging timescales. 
</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> Small batches also simplify optimizer choices. Without large, erratic steps, basic stochastic gradient descent (SGD), with no momentum or weight decay, becomes competitive. This eliminates optimizer state memory overhead. Similarly, Adafactor, which compresses second-moment estimates, performs well in this regime. The reduced hyperparameter sensitivity means less tuning: small batches tolerate wider learning-rate ranges and decay-rate variations, making training more accessible. </p></td></tr><tr><td id="results-and-practical-guidance" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;mso-line-height-alt:125.0%;"><span style="color:rgb(67, 67, 67);">Results and Practical Guidance</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> The researchers performed their experiments across models from 30 million to 1.3 billion parameters to confirm small batches match or exceed large-batch performance. For instance: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Robustness</b></span>: Batch size 1 maintained near-optimal loss across broad hyperparameter ranges, while larger batches degraded sharply with minor misspecifications. </p></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Performance</b></span>: On GPT-3 (1.3B), SGD at batch size 1 matched AdamW’s results at batch size 512. Adam with scaled β₂ even outperformed the baseline. </p></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Memory Efficiency</b></span>: Tiny batches enabled training 13B-parameter models on consumer GPUs using stateless SGD or Adafactor, avoiding gradient accumulation. 
</p></li></ul></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/daa9ac45-fae1-49b6-b042-9b8b6cddf81c/finetune_bar.png?t=1752597387" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="mso-line-height-alt:150.0%;"> These results led the researchers to draw the following conclusions: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Batch Size</b></span>: Use the smallest batch size that still maximizes hardware throughput (typically hundreds of tokens per device). </p></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Optimizers</b></span>: Prefer SGD or Adafactor in memory-constrained settings; if using Adam, scale β₂ to preserve the token half-life. </p></li><li class="listItem ultext"><p style="mso-line-height-alt:150.0%;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Avoid Gradient Accumulation</b></span>: It wastes memory without benefit unless training is bandwidth-bound across devices. 
</p></li></ol></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:656px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/338f8da9-4238-4f49-9fc9-f491948e9547/adam_2d.png?t=1752597401" alt="" height="auto" width="656" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92YKBWqdTa4JpBHCFMMEz87DRlSEGxPVVZItXzV5GVu5ABF7rHDTxq5BUQ6ZvsRiIF_6KK6pk5NA2PiRlrkbM5iq/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h20/h001.v0uucUWWY1Cj6tYHkQRv9UsjPKtzbcfs5An6qef1HYw" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td class="dd" style="padding: 20px;"><table width="100%" cellpadding="0" cellspacing="0" role="none" style="max-width:520px;margin:0 auto;"><tr><td class="q" style="padding:16px 16px 6px 16px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoDDFT6eh5Nsg0xYVQj-h6I3o9m2k79_qw4izMYhmcI36J6JcYQDKmr6_LN5oiq6eNI7pzr_1bNwEqxZEGy4iU2n5oCjdeyJSdjiixv2yH2iwnbu3qQq-FcdaDB5HN7TA63xsWThvPclDoYlVvyU9cz0/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h21/h001.6kAjjTpM0i-n7mldaX13MGzMKKIDZp5iFuRybToHN2I" style="text-decoration:none !important;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="100%" style="padding: 0 0 14px 0;text-decoration:none;width:100%;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="36" style="width:36px;"><img src="https://pbs.twimg.com/profile_images/1698572487909400576/BvncwnrP_normal.jpg" alt="tw profile: The AI Timeline" style="display:block;width:36px;height:36px;border-radius:50%;border:0;"/></td><td width="400" style="padding:0 0 0 8px;text-decoration:none;"><span style="display:block;font-size:14px;color:#1c2022;font-weight:700;"> The AI Timeline </span><span style="display:block;color:#697882;font-size:14px;"> @TheAITimeline </span></td><td width="24" align="right" style="vertical-align:text-top;"><img width="24" height="24" loading="lazy" alt="tw" style="border:0;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_logo.png"/></td></tr></table></td></tr><tr></tr><tr><td style="word-break:break-word;"><p>🚨This week's top AI/ML research papers:</p><p>- Energy-Based Transformers are Scalable Learners and Thinkers <br>- Dynamic Chunking for End-to-End Hierarchical Sequence Modeling <br>- Pre-Trained Policy 
Discriminators are General Reward Models <br>- First Return, Entropy-Eliciting Explore <br>-</p></td></tr><tr><td style="padding:12px 0 0 0;"></td></tr><tr><td align="center" style="padding:8px 0 0 0;width:480px;"><img src="https://pbs.twimg.com/media/GvuuL5ca0AAa4cx.jpg" width="480" height="auto" style="display:block;border:1px solid #E1E8ED;border-radius:5px;width:100%;max-width:480px;height:auto;"/></td></tr><tr><td height="8" style="line-height:1px;font-size:1px;height:8px;"> </td></tr><tr><td align="left" valign="top" class="s"><p>10:13 AM • Jul 13, 2025</p></td></tr><tr><td height="10" style="line-height: 1px; font-size: 1px; height: 10px;"> </td></tr><tr><td height="1" bgcolor="#e1e8ed" style="line-height:0px;font-size:0px;height:1px;"></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td align="left" valign="top" class="s"><p><b style="color:#1C2022">1.11K</b> Likes <b style="color:#1C2022">141</b> Retweets </p></td></tr><tr><td align="left" valign="top" class="s"><div align="center" style="text-align:center;margin-top:4px;margin-bottom:4px;padding:8px;border:1px solid #ccd6dd;border-radius:9999px;color:#1B95E0"><b>7 Replies</b></div></td></tr></table></a></td></tr></table></td></tr></table></td></tr></table></td></tr><tr><td align="center" valign="top"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><tr><td class="b" align="center" valign="top" bgcolor="#2a2a2a" style="padding:0px 0px 0px 0px;border-style:solid;border-width: 0px 0px 0px 0px;border-color: #2a2a2a;border-bottom-left-radius:10px;border-bottom-right-radius:10px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" bgcolor="#73ddff" style="padding:12px"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><span style="padding-left:1px;"></span></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCQqcWoV4NNHHr5SkP9THApWuHAAlWLQxI3Q_IqFmt_DcyAxeC8jDApCnHmMSBGpBb5sgtimvBYgxRX-Rp7s0F3LjCHoSwdhr83OBqRFhJ1y_/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h22/h001.fW3hoWUg0Hk5NR8YUJJkD-J8Im4HhHpnmtfWQ_7wnYE" style="text-decoration:none;"><img width="22" height="22" alt="tw" border="0" style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_dark.png"/></a></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmBoQnQ9VXnB2zTxBG4HeHBgjMqVxpoXRdj01cjwyoVlHgiebEOgBvwHtevoVpsSvpn3Q1di2ml6sb3cBM-X6IStQbj_zQSVGWJ8AAmPw2en2/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h23/h001.TIyuuCFDrL_GQ6motn_sv_lDWTN0rjL669DtPrJ3TvU" style="text-decoration:none;"><img width="22" height="16" alt="yt" border="0" style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/youtube_dark.png"/></a></td><td><span style="padding-left:1px;"></span></td></tr></table></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td class="w" align="center" valign="top" style="padding:15px 15px 15px 15px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><p 
style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> Update your email preferences or unsubscribe <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsBhEpz-DJgyVFmavJPa0OyKRRnvw4o7XGyvIv7PRofnmN-VYMI_S1MVXdZkEcOjnQnUY5bieJHTWm06rVIsqp4ij-yR-U3AWiTf-ak57zDoWGsBImLdEdBQUbyxGoVB3nn7yyPjyo82juOVXEqOKG5QnhcUxjIxCDFAMP0DDAaqezM07ggAl1Bm55Ek1DS6UZ9VJint_DeuuKWrXYiVFgmpqtgbGDTqcPNghPzyw4JDKD5s9lcbOwFWF0eLUODXWiT_gR2_hOIdk1B2jqZvPmIgLP3M1ujabEoJFdd3yFOuE_P3dhUTXduTjDcuXqxzxDlmRH4F0ezbwhbZPmlFdJ_sO0fsAr1TBaNgBBzC60dUHHhQLL8lqj_FRQByJUdbQAaqBVWO4vFWURI9ld07d8V6kd-ab-YxMmnwBBcTUEDjJplCES3iGSQ1gGywtdAmUwyqa-JfLOTcQ5OsDmeOf8PU8UXO8m6-4YCwOKHOBJkcu2Gg-6j6GqFA7F9oGizhgEzZLsnucm4U2TbLHXXZY7Zo_PtZITfMnfYJvkit0q2vaxe-CQ4vnbUSftCY2wpCd0dl5T_CpUpahCjaGOr1fyTqhe1xnntoh76uZMZigaxvGe8OTncHwL9lN0TFhJ3QtXuSgKsvZF_RbsTdneZ9X9C7esW8hxooofKvX_MOwPRtmch40CfEA4fwRM0RmHzjUABwgVfNn0YZnS-QaI7rD75hi9GME0GoP97T8GyOXMf00qSbib1Gn8-bxBL9JCEMStw/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h24/h001.VP5dS8IRYokHl9TgNdvoE62xJgxhCL6ruVwPJVIm2oA" style="text-decoration:underline;text-decoration-color:#FFFFFF!important;color:#FFFFFF!important;"> here</a></p><p class="copyright" style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> © 2025 bycloudai </p><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> 228 Park Ave S, #29976, New York, New York 10003, United States </p></td></tr><tr style="display: table-row !important;"><td align="center" valign="top" style="padding-top:20px;" style="display:table-cell !important;"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="display:table !important;"><tr style="display:table-row !important;"><td class="u" align="center" valign="middle" height="32" style="height:32px;display:table-cell !important; max-height: 32px !important;margin:0px !important; background-color: #ffffff !important;"><a style="line-height:32px !important;text-decoration:none;display:block !important;" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28olDWFpV5DDKfdk_OdOKOgLpZrmeFF5zJTo1ewPMDhABLdRYvbBDacRrWQ0Q6G1FOOY_qK5Xn6kh-FUh2nxyEJcLzO9yBfdpBmJNdwdaQMef_55yoPOe4dIZDL3J_1U3sslGPCddh0j7sFrhp9VlMTvhehBdjd-wnJKEN2_ZHJs-l8XgGXtUgZv6di_b0UhYk549Fw_YDgceYvpDbgA9mt2CDQEEm59rfUvlLvOGH2U/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h25/h001.r8DflNWxxSzR2_jRcXHYiVvfyau2Cnvj6tn9S63Md24"><img src="https://media.beehiiv.com/output-onlinepngtools.png" width="16" alt="beehiiv logo" style="display:inline-block !important;max-width:16px !important; vertical-align:-3px !important;width: 16px !important;" border="0"/><span style="padding-left:11px !important;display: inline-block !important;">Powered by beehiiv</span></a></td></tr></table></td></tr><tr><td align="left" valign="top" height="2" style="height:2px;"><a href='https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWsHIaP4XNp0WgUYqLvHcKk_3uqk_KIkz4ddLinhFbud6JuxLFdSUhYnR7b1NSsmbtzXNGNblnEEMKUtkCAjkn8Y/4i8/qagMnS_rQxW_F_M9Wl6Jrw/h26/h001.ceNayfNIloJACV74c2owS_Gw0aAAq-jVMEXWYltihPk' style="color: #2a2a2a !important; cursor: default; font-size: 1px; text-decoration: none;"> Terms of Service </a></td></tr></table></td></tr></table></td></tr></td></tr></table></td></tr></table></td></tr></table></td></tr></table></div></body></html>