<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" style="font-size:16px;"><head></head><head><meta charset="utf-8"/><!--[if !mso]><!--><meta http-equiv="X-UA-Compatible" content="IE=edge"/><!--<![endif]--><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="x-apple-disable-message-reformatting"/><meta name="format-detection" content="telephone=no,address=no,email=no,date=no,url=no"/><meta name="color-scheme" content="light"/><meta name="supported-color-schemes" content="light"/><title>PokéChamp: an Expert-level Minimax Language Agent</title><!--[if mso]><xml><o:OfficeDocumentSettings><o:AllowPNG/><o:PixelsPerInch>96</o:PixelsPerInch></o:OfficeDocumentSettings></xml><![endif]--><style> :root { color-scheme: light; supported-color-schemes: light; } body { margin: 0; padding: 0; min-width: 100%!important; -ms-text-size-adjust: 100% !important; -webkit-transform: scale(1) !important; -webkit-text-size-adjust: 100% !important; -webkit-font-smoothing: antialiased !important; } .body { word-wrap: normal; word-spacing:normal; } table.mso { width: 100%; border-collapse: collapse; padding: 0; table-layout: fixed; } img { border: 0; outline: none; } table { mso-table-lspace: 0px; mso-table-rspace: 0px; } td, a, span { mso-line-height-rule: exactly; } #root [x-apple-data-detectors=true], a[x-apple-data-detectors=true], #MessageViewBody a { color: inherit !important; text-decoration: inherit !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important; } span.MsoHyperlink { color: inherit !important; mso-style-priority: 99 !important; } span.MsoHyperlinkFollowed { color: inherit !important; mso-style-priority: 99 !important; } .a { background-color:#dedede; } .b { background-color:#2a2a2a; } .c { background-color:#ffffff; } .d { background-color:#fff0c8; } .d2 { background-color:#FFFFFF; } .d3 { background-color:#FFFFFF; } h1 a { text-decoration:none;color:#2A2A2A !important; } h2 a { text-decoration:none;color:#2A2A2A !important; } h3 a { text-decoration:none;color:#2A2A2A !important; } h4 a { text-decoration:none;color:#2A2A2A !important; } h5 a { text-decoration:none;color:#2A2A2A !important; } h6 a { text-decoration:none;color:#2A2A2A !important; } h1, h1 a, h2, h2 a, h3, h3 a, h4, h4 a, h5, h5 a, h6, h6 a, ul, li, ol, p, p a { margin: 0;padding: 0; } h1 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:28px;color:#2A2A2A;line-height:42px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h2 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:24px;color:#2A2A2A;line-height:36px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h3 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:20px;color:#2A2A2A;line-height:30px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h4 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:18px;color:#2A2A2A;line-height:27px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h5 { font-family:'Trebuchet MS','Lucida 
Grande',Tahoma,sans-serif;font-weight:400;font-size:16px;color:#2A2A2A;line-height:24px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h6 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:14px;color:#2A2A2A;line-height:21px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } p { font-family:'Georgia','Times New Roman',serif;font-weight:400;color:#2D2D2D;font-size:16px;line-height:24px;padding-bottom:8px;padding-top:8px;mso-margin-top-alt:8px;mso-margin-bottom-alt:8px; } p a, .e a, ul a, li a, .h a, .h2 a, .h3 a { word-break:break-word;color:#2C81E5 !important;text-decoration:none;font-style:italic; } p a span, .e a span, ul a span, li a span { color: inherit } p .bold { font-weight:bold;color:#2D2D2D; } p span[style*="font-size"] { line-height: 1.6; } .f p { font-size:12px;line-height:15px;color:#2D2D2D;padding:0; } .f p a { color:#2D2D2D !important; } .g p { font-family:'Helvetica',Arial,sans-serif;font-size:14px;line-height:20px;font-weight:normal;margin:0; } .g p a { text-decoration: underline; } .i p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i p a { color:#2D2D2D !important; } .i2 p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i2 p a { color:#2D2D2D !important; } .i3 p { font-family:'Helvetica',Arial,sans-serif;line-height:43px;font-size:24px;color:#2D2D2D; } .i3 p a { color:#2D2D2D !important; } .h p a { color:#595959 !important; } .h2 p a { color:#595959 !important; } .h3 p a { color:#595959 !important; } .f p a, .i p a, .i2 p a, .i3 p a, .h p a, .h2 p a, .h3 p a { text-decoration:underline; } .j { border-top:3px solid #ffeb2d; } .k p { padding-left:15px;padding-bottom:0px;padding-top:6px;mso-margin-top-alt:6px;mso-margin-bottom-alt:0px;mso-margin-left-alt:15px; } .o { background-color:#FFFFFF;border:1px solid #F1F1F1;border-radius:5px; } .o p { font-family:'Helvetica',Arial,sans-serif;padding:0px;margin:0px; } .l p, .l p a { font-size:14px;line-height:20px;font-weight: bold;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .m p, .m p a { font-size:13px;line-height:18px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .n p, .n p a { font-size:12px;line-height:17px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .p { background-color:#FFFFFF;max-width:520px;border:1px solid #E1E8ED;border:1px solid rgba(80, 80, 80, 0.3);border-radius:5px; } .q { font-size:16px;font-family:Helvetica,Roboto,Calibri,sans-serif !important;border:1px solid #e1e8ed;border:1px solid rgba(80, 80, 80, 0.3);border-radius:10px;background-color:#FFFFFF; } .q p { font-size:16px;font-family:system-ui,Helvetica,Roboto,Calibri,sans-serif !important;color:#222222;padding:4px 0; } .r { border:1px solid #E1E8ED !important;border-radius:5px; } .s p { font-size: 14px; line-height: 17px; font-weight: 400; color: #697882; text-decoration: none; } .t p { font-family:'Helvetica',Arial,sans-serif;font-size:12px;line-height:18px;font-weight:400;color:#000000;font-style:italic;padding:4px 0px 0px;} .v { border-radius:10px;border:solid 0px #DFD150;background-color:#2C81E5;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;color:#FFFFFF; } .v a { text-decoration:none;display:block;color:#FFFFFF; } .w p { 
font-size:12px;line-height:15px;font-weight:400;color:#FFFFFF; } .w p a { text-decoration: underline !important;color:#FFFFFF !important; } ul { font-family:'Helvetica',Arial,sans-serif;margin:0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:disc;font-size:16px; } ul > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:disc; } ol { font-family:'Helvetica',Arial,sans-serif;margin: 0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:decimal;font-size:16px; } ol > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:decimal; } .e h3, .e p, .e span { padding-bottom:0px;padding-top:0px;mso-margin-top-alt:0px;mso-margin-bottom-alt:0px; } .e span, .e li { font-family:'Helvetica',Arial,sans-serif;font-size:16px;color:#2D2D2D;line-height:24px; } .rec { font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji" !important; } .rec__button:hover { background-color: #f9fafb !important; } .copyright a {color: inherit !important; text-decoration: none !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important;} .txt_social p { padding: 0; word-break: break-all; } .table, .table-c, .table-h { border: 1px solid #C0C0C0; } .table-c { padding:5px; background-color:#FFFFFF; } .table-c p { color: #2D2D2D; font-family:'Helvetica',Arial,sans-serif !important;overflow-wrap: break-word; } .table-h { padding:5px; background-color:#F1F1F1; } .table-h p { color: #2A2A2A; font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif !important;overflow-wrap: break-word; } @media only screen and (max-width:667px) { .aa { width: 100% !important; } .bb img { width: 100% !important; height: auto !important; max-width: none !important; } .cc { padding: 0px 8px !important; } .ee { padding-top:10px !important;padding-bottom:10px !important; } .ff ul, .ff ol { margin: 0px 0px 0px 10px !important;padding: 0px !important; } .ff li { margin:10px 0px 0px 10px !important; } .r {height:140px !important;} .s p { font-size:13px !important;line-height:15px !important; } .mob-hide {display:none !important;} .mob-stack {display:block !important;width:100% !important;} .mob-w-full {width:100% !important;} .mob-block {display:block !important;} .embed-img {padding:0px 0px 12px 0px !important;} .socialShare {padding-top:15px !important;} .rec { padding-left:15px!important;padding-right:15px!important; } .bodyWrapper { padding:7px 4px 7px 4px !important; } .social-mobile {float:left !important;margin-top:10px !important;} } @media screen and (max-width: 480px) { u + .a .gg { width: 100% !important; width: 100vw !important; } .tok-heart { padding-top:75% !important; } .tok-play { padding-top: 250px !important; } } @media screen and (max-width: 320px) { .tok-heart { padding-top:65% !important; } } .u { border: 1px solid #CACACA !important; border-radius: 2px !important; background-color: #ffffff !important; padding: 0px 13px 0px 13px !important; font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif !important;font-size: 12px !important; color: #767676 
!important; } .u a { text-decoration: none; display: block !important; color: #767676 !important; margin: 0px !important; } .u span, .u img { color: #767676 !important;margin:0px !important; max-height:32px !important;background-color:#ffffff !important; } </style><!--[if mso]><style type="text/css"> sup { font-size: 100% !important;vertical-align: .5em !important;mso-text-raise: -1.5% !important;line-height: 0 !important; } ul { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ul li { margin-left: 0px !important; mso-special-format: decimal; } ol { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ol li { margin-left: 0px !important; mso-special-format: decimal; } li.listItem { margin-left:15px !important; margin-top:0px !important; } .paddingDesktop { padding: 10px 0 !important; } .edm_outlooklist { margin-left: -20px !important; } .embedImage { display:none !important; } </style><![endif]--><style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-display: swap; src: url('https://fonts.gstatic.com/s/opensans/v40/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4uaVIUwaEQbjB_mQ.woff2') format('woff2'); } @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 700; font-display: swap; src: url('https://fonts.gstatic.com/s/opensans/v40/memQYaGs126MiZpBA-UFUIcVXSCEkx2cmqvXlWq8tWZ0Pw86hd0RkyFjWV4ewIMUdjFXmSU_.woff2') format('woff2'); } </style></head><body class="a" style="margin:0px auto;padding:0px;word-wrap:normal;word-spacing:normal;background-color:#dedede;"><div role="article" aria-roledescription="email" aria-label="email_name" lang="en" style="font-size:1rem"><div style="display:none;max-height:0px;overflow:hidden;"> Plus more about Optimal Hyperparameter Scaling Law in Large Language Model Pretraining and (How) Do Language Models Track State?  
‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ </div><table role="none" width="100%" border="0" cellspacing="0" align="center" cellpadding="0" class="gg"><tr><td align="center" valign="top"><table role="none" width="670" border="0" cellspacing="0" cellpadding="0" class="aa" style="width:670px;table-layout:fixed;"><tr><td class="bodyWrapper" align="center" valign="top" style="padding:7px 7px 7px 7px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="border:4px solid #2a2a2a;border-radius:0px;background-color:#ffffff;" class="c"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td class="f" align="right" valign="top" style="padding:20px 28px;"><p> March 13, 2025 | <a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxSdB5RCIH6yy1Fm1CYma3ExfSMxBacmr6gVCoNxgFjs8a879oDmfJ_1B4AZky64-sQpXXqQq6T1W8YIAbCjvY8E6RcqP9KxKouK183RuVCYNohAt1l-q8x2S8zmHjQ7YmCjDxiPfsvpoRSTNbMl1-2ZnzcBgcfaN05yIE7V3dElSOh0tf3ASWN0bhMTlclIir52dajImn4NaD_bQGBO5979ZQyEGqdekseSKnpPC2vxQCrlfKIUdmyP6G6ZRF5zXwFtiRHsoG0-b8J0BEZ6iKOuqBGZ8a3SE-XB2jJxBISFpUhRFd3sEuMy4AtTPVNR85XVnOEbWZj0kQPKF3cTk3QO-gcpYoOjUTUC91le8cefjS2HTUCdc7r6CxEmKI8_cUKITl1hThGJ1Grs--BqPQdLfXFsA33W2IseMAS7SBaP94WJQTnS1YOvZRIJrNef4OaX4TKDJ_uU5xNEBoWsvCWXNI1P_AVuxq0UbWy6GVqPAmp8TyCJap8uMlLwqOoMy00Y7S-8pojyxmJSi1cTg5-uNL4EY-5kJlKh6NE3SmHVzvtjwSx4daM28OTn2reZAwKEMdEodIEDzKZUMY1ZBpGAbU0ESJSgnU9IUqZIXeFmJvKJuN3Cf5U2Bla3kQn-3qZmuqpgdZtyEpFdXRxASPvHu9DATakc6fH-tOFRKSll8/4er/qExZERPuR5CCoMME6COllg/h0/h001.8Dl6yjsA_iZdezjDMJvEIfJTmX9u47Jh62IasXFmFiU">Read Online</a></p></td></tr><tr><td class="dd" align="center" valign="top" style="padding:15px 28px 20px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><h1 style="text-align:left;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;font-weight:Bold;font-size:32px;color:#2A2A2A;padding:2px 0;line-height:38px;"> PokéChamp: an Expert-level Minimax Language Agent </h1><p style="text-align:left;font-family:'Helvetica',Arial,sans-serif;font-weight:normal;font-size:20px;color:#3E3E3E;padding:5px 0;line-height:24px;"> Plus more about Optimal Hyperparameter Scaling Law in Large Language Model Pretraining and (How) Do Language Models Track State? 
</p></td></tr></table></td></tr><tr><td style="height:0px;width:0px;"><div style="height:1px;" data-open-tracking="true"> <img src="https://elink4f7.mail.bycloud.ai/ss/o/u001.3wmUuY8gEWd4_869a_eXcg/4er/qExZERPuR5CCoMME6COllg/ho.gif" alt="" width="1" height="1" border="0" style="height:1px !important;width:1px !important;border-width:0 !important;margin-top:0 !important;margin-bottom:0 !important;margin-right:0 !important;margin-left:0 !important;padding-top:0 !important;padding-bottom:0 !important;padding-right:0 !important;padding-left:0 !important;"/> </div></td></tr><tr id="content-blocks"><td class="email-card-body" align="center" valign="top" style="padding-bottom:28px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td id="nov-18-th-nov-24-th-33-latest-ai-re" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h6 style="color:#2A2A2A;font-weight:normal;"><i>Mar 3rd ~ Mar 9th</i><br><i>#46 Latest AI Research Explained Simply</i></h6></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="industry-news-in-1-line" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">🗞️ Industry News in 1 Line</h2></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 2.2k</span></span> OpenAI has introduced several <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DIqyAo9xTeoWriogq2VlWeUmi9WmFR4pnC4wMSHAHOG2R-EY2Nxb3rebH-4VIMOE1C3o6hnx5cmiUkG8j9hk5MWCBkZta5Z1OfK-uzzgMvOVlJYbhXQXzic3SrMZPYL9V3IniQue_Rtva1Obe3PfpggheeaS9YFxyXXZzjY0svBqFTYPnME_5sv5TnG7CPL2/4er/qExZERPuR5CCoMME6COllg/h1/h001.1oKj8QOCbM8uKrIUBmxZRxd28jxlmR3PeTRdwUyidBc" target="_blank" rel="noopener noreferrer nofollow"><span>new tools and SDKs</span></a> to enhance AI agent capabilities. The <b>Web Search tool</b> allows agents to retrieve up-to-date information from the web, while the <b>File Search tool</b> enables precise information retrieval from large document collections. Additionally, the Computer Use tool, powered by the CUA model, allows agents to perform tasks on computers, and the new Agents SDK facilitates the orchestration of multi-agent workflows. 
</p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:480px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/924e3ff7-defb-4fe0-8b0b-be676495337b/_528210AC-C1EE-4929-A252-BC6B026F29C2_.png?t=1741792075" alt="" height="auto" width="480" style="display:block;width:100%;" border="0"/></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 11k</span></span> Mistral AI has launched <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxU6myLBNr0kxu76gma1gUlY5slDLx28SAcQJlWPyaL04U2eFqJleGVVspoHMl8vYQqPjimMdVdhlmEf1ChSfuAyKKVuQDrLOVAcV14Fix8FWCdBGqiNmhOkPmttR92wgLQ/4er/qExZERPuR5CCoMME6COllg/h2/h001.jZq-9vdIqmP0u0i24od5QATZ98nBmoLfWANZQHZk9lQ" target="_blank" rel="noopener noreferrer nofollow"><span>Mistral OCR</span></a>, an Optical Character Recognition API that can understand media, text, tables, and equations within complex documents. It can process images and PDFs, extracting their content in an ordered, interleaved format, which makes it ideal for RAG systems handling multimodal documents. </p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:480px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/7b19641f-42cf-4565-8a0c-dab98f109efe/_DFF60C2D-D7ED-47E5-B68B-25907A776454_.png?t=1741792281" alt="" height="auto" width="480" style="display:block;width:100%;" border="0"/></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 8.9k</span></span> The Qwen Team has introduced <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.wcXdj6dB6nd1Cx4inzJNk_td6EHXs7wvRPySUkyhYFfG0F_jLUnHEwgw-tRCu0bStUleRuOh43ZycedQ23JKht3h7Tw9lZ4CKegH6oilMaxBmojk3MNPmOUPMjVgyF8vFODHzbRKb09A6KrhrZTMlewQgkgx8Pn2Wp-1StGzB8w/4er/qExZERPuR5CCoMME6COllg/h3/h001.Qh09hsabT-WSb_rVCXFYRJjvKG5B8A_yg3rekge3bi0" target="_blank" rel="noopener noreferrer nofollow"><span>QwQ-32B</span></a>, a 32 billion parameter model that leverages reinforcement learning to achieve performance comparable to the much larger DeepSeek-R1 model. QwQ-32B demonstrates significant improvements in mathematical reasoning, coding proficiency, and general problem-solving capabilities, thanks to a multi-stage RL approach that begins with cold-start data and progresses through specialized training for math and coding, followed by general capability enhancement. Its weights have been released under the <b>Apache 2.0 license</b> on Hugging Face and ModelScope, and the model is accessible via Qwen Chat. 
</p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 3k</span></span> Google DeepMind has launched the <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.oB7zuO_W-X4Toa45C28ngyEFO5Gj_UpIFzWcxERDRtBJ-eCgpfLld0p0ovem1xfJ6ND9rubwYEdLqQka50ILHcRyoLY5MppOKbw2Xv70izJNgSn9_LBFasU44quYdNZMfToezMoyfi2ZAPemx0WPX0pDManBhrD8Kjco_ueetN8/4er/qExZERPuR5CCoMME6COllg/h4/h001.RpFD5sAQ9diazL-S93rSfhEgND48IPIY2S9UH3FaS-8" target="_blank" rel="noopener noreferrer nofollow"><span>Gemma 3</span></a> family of open models, featuring variants with 1 billion, 4 billion, 12 billion, and 27 billion parameters. The models incorporate vision input capabilities with a 400 million parameter SigLIP model and support a context length of 128k. Notably, the 27B parameter model ranks 9th on LMArena, <b>surpassing models like o3-mini, DeepSeek V3, Claude 3.7 Sonnet, and Qwen2.5-Max</b>. </p></li></ol></div></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="transparent" style="background-color:transparent;border-color:#2C81E5;border-style:solid;border-width:5px;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;"><span style="">RTX 4080 SUPER Giveaway With NVIDIA’s GTC 2025</span></h2></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1r-Oo1v2APbJapoJieKk3V2DO7NkBsza57dliEOwa6PrCHfGqwkzFzXNcPKf-_zR2eofoRDJVIr2uWPCL0O09M38ctdoxt0-4uft8xTqPF4A2L5bfiFSy8hNnpelPJkW/4er/qExZERPuR5CCoMME6COllg/h5/h001.gcFLWXlR89xB0rVmBEEm-YO6ynpA3NxpzLlNWkpZ1AQ" rel="noopener noreferrer nofollow" style="text-decoration:none;" target="_blank"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/6b4027f1-7cb0-483c-bc62-f7c3e56d1d36/geforce-rtx-4080-super-og-1200x630.jpg?t=1739758485" alt="RTX4080 SUPER Giveaway" height="auto" width="600" style="display:block;width:100%;border-radius:0px 0px 0px 0px;border-style:solid;border-width:0px 0px 0px 0px;box-sizing:border-box;border-color:#E5E7EB;" border="0"/></a></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p><span style="">RTX 4080 SUPER Giveaway!</span></p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">During GTC, NVIDIA’s annual flagship AI & developer conference (March 17-21, 2025), there will be various big announcements, events, and sessions you can 
</span><span style=""><b>attend either in person or virtually</b></span><span style="">. </span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">This is one of the best times to learn from global experts on how generative AI is impacting industries and society as a whole.</span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">You can </span><span style=""><b>virtually attend </b></span><span style="">sessions like:</span></p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="list-style-type:disc;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">How to Build an Agentic AI Blueprint Using the Best Tools and Frameworks, hosted by the director of engineering at NVIDIA </span></p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Accelerate Inference on NVIDIA GPUs, hosted by the CTO of Together AI</span></p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">That way, you can discover the latest breakthroughs in generative AI and NVIDIA technologies from subject-matter experts at </span><span style=""><b>#GTC25</b></span><span style="">.</span></p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.sa7HIrCkEzyny8lstY55mOGXqey6XwqgGMAgXuOux3xG-dEbFgoBohMjdiScpbE6OarznDTReYOnQFUI9Bc1j38TyGFk4-c7VfnSuhABE8Q/4er/qExZERPuR5CCoMME6COllg/h6/h001.0CkWIAYuEt8yL2F-l2r7HXdIKsB2imcAcTv49skgpls" rel="noopener noreferrer nofollow" style="text-decoration:none;" target="_blank"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/3ed23905-cca4-45e7-9d22-597420c6671a/Screenshot_2025-02-16_210612.png?t=1739757983" alt="Highlighted Technical Speakers List from GTC2025" height="auto" width="600" style="display:block;width:100%;border-radius:0px 0px 0px 0px;border-style:solid;border-width:0px 0px 0px 0px;box-sizing:border-box;border-color:#E5E7EB;" border="0"/></a></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p><span style="">Highlighted Technical Speakers List from GTC2025</span></p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">By virtually attending these sessions, you can </span><span style=""><b>join my giveaway for an RTX 4080 SUPER</b></span><span style="">. 
All you have to do is take a selfie of yourself attending the LIVE virtual sessions available during GTC (March 17-21), submit it using </span><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.9ggl6Mt0xphuuMReR5gVpdhff8iVAkayl-q5vnwXNgVweea5z7E-FAjaLRloITHq8yyEUjt04McVD-wRtPzdBiM1tfVTT0WaI0pZkebK9Gwmmt6M0mcypod2k8OqurmIy8X8Gjtp9CgU3FqMla57lG88BbsqIbOWrVWNcK80lWc_x3ztksmyFHrD1Lx1KLG5W_ek9F96B5Uerrm3BHgyuw/4er/qExZERPuR5CCoMME6COllg/h7/h001.FDiwpuXFw74DzgrdLe1wdxR6_8KDukvrO1XKeQ15cz0" target="_blank" rel="noopener noreferrer nofollow"><span>this Google Form</span></a></span><span style="">, and you can learn and possibly win a GPU at the same time! You can find more information on the Google Form.</span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">And of course, the yearly keynote by NVIDIA CEO Jensen Huang will be happening on Tuesday, </span><span style=""><b>March 18th at 10am Pacific Time</b></span><span style="">. Don’t miss it!</span></p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style=""><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-radius:10px;border:solid 0px #DFD150;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.sa7HIrCkEzyny8lstY55mOGXqey6XwqgGMAgXuOux3yu6qrZISwQvLbW-sBEWRxxbJysVXf9yVxXsRD_FkUkvU7_scgDqhPuFfdBQW-fkUg/4er/qExZERPuR5CCoMME6COllg/h8/h001.ipVRAMwCVFw6PvoErF9cKurjgWDiHDoV7nhhr2_Qt1Q" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;padding:0px 14px;text-decoration:none;"> Check Out GTC 2025! </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoGymQ3NNPtd5dE5MV_8UgjIDFPVXngz8pvQBldSW42yhUe_Qiq6DgEPMEBuPL9yfRpXelTiuu2kS8pLFvsoem_XoZoy_n13sTKUhZIbl0VH6/4er/qExZERPuR5CCoMME6COllg/h9/h001.g-IFzt-W-6OPcDspX1Lqt5kD4UqRimcPmOZeT32Maak" target="_blank" rel="noopener noreferrer nofollow"><span>Advertise with The AI Timeline! </span></a></span></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="pok-champ-an-expertlevel-minimax-la" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">PokéChamp: an Expert-level Minimax Language Agent</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Karten et al. 
[Princeton University]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 1.1k </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Agent </span></span></p></td></tr><tr><td id="introduction-to-pok-champ" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to PokéChamp</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Creating agents that can compete effectively in complex, partially observable environments like Pokémon battles is a significant challenge. Traditional reinforcement learning approaches often require extensive task-specific training, which is resource-intensive and adapts poorly to new scenarios. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This paper introduces PokéChamp, a novel agent that leverages the generalist capabilities of LLMs to enhance minimax tree search in Pokémon battles. By integrating LLMs into three key modules (player action sampling, opponent modeling, and value function estimation), PokéChamp uses gameplay history and human knowledge to reduce the search space and address partial observability. This approach <span style="font-weight:700;"><b>does not require additional LLM training</b></span>, making it highly flexible and efficient. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/bbafdd05-72b7-40c8-ba23-e64138e55dde/_40FAD0F7-E553-4B7B-AD51-61BC6012496F_.png?t=1741710616" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="how-does-pok-champ-work" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">How Does PokéChamp Work?</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> PokéChamp is built around three key components: approximate game transitions, player action sampling, and opponent modeling. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> First, the authors tackle the challenge of <span style="font-weight:700;"><b>simulating game transitions</b></span> under partial observability. They use statistical data from Pokémon Showdown, including move pools, EV spreads, and item usage, to infer hidden information and approximate the latent state. They also incorporate LLM predictions to estimate hidden opponent variables, such as attack and defense stats, based on game history. 
After predicting the current state, they simulate the next state using a local Showdown simulator. To manage the computational burden, they simplify the approach by computing expected values within these transitions. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/54fff45d-b6d0-4fc6-81d3-ba751b362586/_35D80044-351F-4DF2-9FDF-C31AC04A3262_.png?t=1741710657" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Next, for <span style="font-weight:700;"><b>player action sampling</b></span>, PokéChamp generates a set of candidate actions for the minimax search tree. The input prompt for action sampling includes the team strategy, observable state, battle history, approximate state transition heuristic, and available actions. In addition to LLM-generated actions, the authors include candidate actions from their tools, such as the top move choice from a one-step lookahead and the top switch choice from the Abyssal bot. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> For <span style="font-weight:700;"><b>opponent modeling</b></span>, the authors address the partial observability of the opponent's actions and hidden state information. They use historical data to estimate unknown opponent stats and employ LLM-based predictions to generate likely opponent actions, based on a prompt similar to the action sampling process but focused on the opponent's perspective. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/bf5691aa-63c4-4737-b854-4b903e28127c/_2888684C-6EC0-44CA-B38A-910E4BA3CA11_.png?t=1741710692" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>PokéChamp replaces components of minimax tree search with LLM-based generations</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Finally, due to the computational constraints of live gameplay, an LLM-generated value function evaluates the leaf nodes of the minimax tree. The LLM generates a score based on positive factors like the effectiveness of current moves and the number of remaining Pokémon, and negative factors like excessive switching and the opponent's move effectiveness. By combining these three components (action sampling, opponent modeling, and value function approximation), PokéChamp effectively navigates the complex, partially observable state space of Pokémon battles, approximating optimal play within the constraints of real-time gameplay. 
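</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> To make the search loop concrete, here is a minimal sketch of a depth-limited minimax in which all three LLM components plug in. The function names (sample_actions, predict_opponent_actions, value_estimate) and the simulator interface are our assumptions for illustration, not the paper's actual API. </p></td></tr><tr><td class="dd" align="left" style="padding:8px 28px;text-align:left;"><pre style="background-color:#F6F6F6;border:1px solid #E1E8ED;border-radius:5px;padding:12px;font-family:Consolas,Monaco,'Courier New',monospace;font-size:13px;line-height:18px;color:#2D2D2D;overflow-x:auto;">
# Minimal sketch of LLM-augmented, depth-limited minimax (illustrative;
# the llm and simulator interfaces are assumptions, not the paper's API).

def minimax(state, depth, llm, simulator):
    """Return (value, action) for the player to move at `state`."""
    if depth == 0 or state.battle_over():
        # Leaf nodes are scored by the LLM-generated value function
        # (move effectiveness, remaining Pokemon, switching penalties, ...).
        return llm.value_estimate(state), None

    best_value, best_action = float("-inf"), None
    # The LLM proposes a few candidate player actions (plus tool
    # suggestions such as a one-step-lookahead move), pruning the tree.
    for action in llm.sample_actions(state, k=3):
        worst = float("inf")
        # Opponent model: the LLM predicts likely opponent responses
        # instead of enumerating every legal move.
        for opp_action in llm.predict_opponent_actions(state, k=3):
            # Approximate transition: hidden stats are inferred from
            # Showdown usage statistics, then the local simulator steps
            # forward using expected values.
            next_state = simulator.expected_step(state, action, opp_action)
            value, _ = minimax(next_state, depth - 1, llm, simulator)
            worst = min(worst, value)
        if worst &gt; best_value:
            best_value, best_action = worst, action
    return best_value, best_action
</pre></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Keeping both candidate sets small is what keeps this loop inside the time constraints of live play, which is why the quality of the LLM's candidate generation matters so much.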
</p></td></tr><tr><td id="results-and-real-world-implications" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Results and Real-World Implications of PokéChamp</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The evaluation of PokéChamp was conducted using both an offline dataset and online games on the Pokémon Showdown platform. The compiled dataset, consisting of over 3 million battles, provided a foundation for analyzing transition probabilities and opponent policies, with detailed information on team compositions, move choices, and battle outcomes. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/71fb9dba-89de-4d51-b3e5-7cece81f47c1/_6E178F3D-DFCC-4EE0-B4B1-C3C786995CC4_.png?t=1741710791" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Action prediction experiments showed that PokéChamp achieved a player action prediction accuracy of 26-30% and an opponent action prediction accuracy of 13-16% across various Elo ratings, significantly outperforming random prediction baselines. In small game puzzles designed to test core game mechanics, PokéChamp demonstrated an <span style="font-weight:700;"><b>86% win rate</b></span> in 1v1 battles, surpassing PokéLLMon's 76%, and effectively utilized special mechanics like Terastallization and Dynamax to gain strategic advantages. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/f88e9257-5360-4120-8040-d0c7e733540b/_C748C54E-8132-4E18-A65E-2C46E7058EA3_.png?t=1741710817" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Against human players on the online ladder, PokéChamp achieved a 76% win rate within the time constraints, reaching an Elo rating above 1300 and placing it between the top 30% and top 10% of players. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style=""><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-radius:10px;border:solid 0px #DFD150;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92aFYDLV1Pm_D9GX2BG6KN9qPQe_6q2XffoKwmE8br1jNUyFtDsOpuAS6vfC_P_RfZBDU2rp_nDZpX4e0xA6Kezt/4er/qExZERPuR5CCoMME6COllg/h10/h001.8Khdp3OQyf59G7hdZxR8-Or2-aBymzKHfpg4-zajKXM" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;padding:0px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="predictable-scale-part-i-optimal-hy" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Predictable Scale: Part I -- Optimal Hyperparameter Scaling Law in Large Language Model Pretraining</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><i>Li et al. [StepFun, Fudan University, Tsinghua University, Megvii Technology]</i></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 136 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Pre-training </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="introduction-to-hyperparameter-opti" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to Hyperparameter Optimization</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The paper addresses the challenge of hyperparameter optimization for LLMs, which is computationally expensive at scale. Through extensive studies on 3,700 LLMs and nearly one million GPU hours, the researchers discovered universal scaling laws governing optimal hyperparameters. Their proposed "Step Law" establishes that optimal learning rates follow a power-law relationship with both model parameters and data sizes, while optimal batch sizes scale primarily with data sizes. 
</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> These formulas achieve results within 0.09% of globally optimal performance without exhaustive searches. Notably, these scaling laws demonstrate <span style="font-weight:700;"><b>robustness across various model architectures</b></span>, including both dense transformers and Mixture-of-Experts models, as well as different data distributions, providing a plug-and-play tool for efficient hyperparameter selection in LLM training. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/4baefeb4-c10c-4b95-a988-77cf8d24f124/_1AF82669-9779-4725-9155-08EDCD0AF16E_.png?t=1741710872" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>Comparison of optimal hyperparameter scaling laws across different approaches.</p></td></tr></table></td></tr><tr><td id="core-mechanism-of-llm-hyperparamete" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Core Mechanism of LLM Hyperparameter Optimization</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The Step Law presents a universal hyperparameter optimization framework for Large Language Models based on extensive empirical research across 3,700 model configurations. 
Its central innovation lies in two power-law scaling formulas that predict optimal hyperparameters: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Learning Rate Formula</b></span>: </p><ul style="list-style-type:disc;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Scales inversely with model size (N) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Scales positively with data size (D) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Captures the complex interplay between model capacity and training data volume </p></li></ul><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/5368d285-51e0-4091-812e-d466dcd148cd/_9058F737-CDFC-4262-9F1B-99CF5A1D918D_.png?t=1741710975" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Batch Size Formula</b></span>: </p><ul style="list-style-type:disc;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Primarily depends on dataset size (D) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Remains largely invariant to model parameters </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Follows a sublinear scaling pattern with data volume </p></li></ul><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/3edb8ff0-eb98-4266-8d5e-845115a2064e/_9E11640D-F440-4E04-A59F-38EC6AF0DE91_.png?t=1741711105" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></li></ol></div></td></tr><tr><td id="technical-architecture-implementati" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Technical Architecture & Implementation</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The system's architecture rests on four key technical discoveries: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 
0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Convex Loss Landscape</b></span>: The researchers demonstrated that hyperparameter space forms a convex optimization landscape with a stable plateau around optimal values. This property ensures that small deviations from ideal settings still produce near-optimal results. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Fixed Final Learning Rate Strategy</b></span>: Unlike conventional approaches that decay learning rates proportionally to their initial values, Step Law employs a constant minimum learning rate (10<sup>-5</sup>). This prevents the "left-skew" bias problem where high initial learning rates produce disproportionately large final rates that impair convergence. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Universal Transferability</b></span>: The scaling laws demonstrate remarkable robustness across varied model architectures (dense transformers and Mixture-of-Experts), different sparsity ratios, and diverse data distributions. This universality eliminates the need for domain-specific hyperparameter tuning. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Joint Optimization</b></span>: Unlike previous approaches that optimized learning rate or batch size in isolation, Step Law captures the interdependencies between these parameters for globally optimal configurations. </p></li></ol></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The approach was validated through extensive experimentation, achieving results within 0.09% of the global optima found via exhaustive searches while reducing the computational overhead of hyperparameter optimization by orders of magnitude. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/d6064c62-930f-4c88-910c-8b2455d4356c/_6F05EC9E-EDFF-4DC5-A79C-4F27FEC2172A_.png?t=1741711151" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="results-and-evaluation" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Results and Evaluation</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The authors present a significant advancement in hyperparameter optimization for LLMs by introducing universal scaling laws for learning rate (LR) and batch size (BS). 
This study concluded that these scaling laws exhibit topological invariance, maintaining consistent scaling constants across different model scales and data sizes, even when varying the topological features of model architectures. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Additionally, the scaling laws are shown to be effective beyond dense Transformers, extending to sparse Mixture of Experts (MoE) models and <span style="font-weight:700;"><b>maintaining high prediction accuracy</b></span> across various sparsity levels. The robustness of these scaling laws is further evidenced by their performance across diverse data distributions, highlighting their broad applicability in different neural architectures. </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/041ea2bc-39c7-4883-b10f-9c79d1488e8f/_D8B5FA9E-BE98-429A-91DA-654C070BB8F4_.png?t=1741711273" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style=""><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-radius:10px;border:solid 0px #DFD150;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28yjf9KIXZdsXoh1WlHvvKnH61OsCUzrX5oXVwy4v9OCqSu4LdOUiyYGT9t274nxQzNkPqq2heK79zS0_YttKjt7OevbI1SV5WLs3SmzYHma/4er/qExZERPuR5CCoMME6COllg/h11/h001.QDl7zy5kk2atq1gDnC8iG7P8I9_DdHSW-zS1r2w_m4I" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;padding:0px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="how-do-language-models-track-state" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">(How) Do Language Models Track State?</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><i>Li et al. 
[MIT EECS]</i></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 253 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Interpretability </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> bycloud’s pick </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="introduction-to-states-in-language-" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Introduction to States in Language Models</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Language models show impressive abilities to track state changes (like following narratives or executing code), but how they actually accomplish this remains unclear. This paper uses permutation composition tasks, where models must predict final object positions after a series of swaps, as a simplified model for studying state tracking mechanisms. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The researchers discovered that across various model architectures, language models consistently learn one of two algorithmic solutions: either an "associative algorithm" that resembles theoretical constructions from prior work, or a "parity-associative algorithm" that first narrows possibilities using a parity heuristic before refinement. Notably, they found no evidence for step-by-step simulation or fully parallel computation approaches, despite their theoretical viability. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Through careful interventions and training experiments, the researchers demonstrate that these mechanisms can be predicted and even steered through specific intermediate training tasks, which provides valuable insights into how language models might track state when processing language, code, and interactive scenarios. 
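</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> To picture the task itself, here is a toy generator (our illustration, not the authors' code) that emits a sequence of random swaps together with the final arrangement a state-tracking model must predict; the five-object setting and swap count are assumptions made for the example. </p></td></tr><tr><td class="dd" align="left" style="padding:4px 28px 12px;text-align:left;"><pre style="margin:0;padding:12px;background-color:#F6F8FA;border:1px solid #E1E8ED;border-radius:5px;font-family:Consolas,Menlo,Courier,monospace;font-size:13px;line-height:18px;white-space:pre-wrap;word-break:normal;color:#2D2D2D;">import random

def make_swap_sequence(n_objects=5, n_swaps=10, seed=0):
    """Build one permutation-composition example: a list of swaps plus the
    final arrangement that results from applying them in order."""
    rng = random.Random(seed)
    state = list(range(n_objects))  # object i starts at position i
    swaps = []
    for _ in range(n_swaps):
        i, j = rng.sample(range(n_objects), 2)
        state[i], state[j] = state[j], state[i]  # apply the swap
        swaps.append((i, j))
    return swaps, state

swaps, final_state = make_swap_sequence()
print(swaps)        # the "narrative" of moves the model reads
print(final_state)  # the answer it must produce</pre></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Each such sequence is a tiny narrative whose ending can only be predicted by composing every step, which is what makes it a clean probe of state tracking.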
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/7d45d65d-941e-4d9c-ade4-35b99adbcaf8/_2B5A4FD7-A6D1-4FAE-8A08-228A65004DB6_.png?t=1741711374" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="what-mechanisms-do-transformers-use" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">What Mechanisms Do Transformers Use for State Tracking?</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The researchers investigate how language models track state changes and propose four theoretical algorithms these models might implement: Sequential (step-by-step processing), Parallel (constant-depth computation), Associative (hierarchical composition), and Parity-Associative (a two-stage approach combining parity heuristics with associative methods). </p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/811d19a3-36c7-4a06-b241-49a47138229c/_30C06EAD-231B-4BDA-8993-BD056899CA47_.png?t=1741711403" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px;"><p>The sequential algorithm composes permutations one at a time from left to right.</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This paper establishes clear "signatures" for each algorithm using two analysis techniques: prefix patching (measuring how much of a prefix must be modified to affect outputs) and probing (testing what can be decoded from intermediate layers). </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The Parity-Associative Algorithm is a novel hybrid approach: models first compute the state's parity and then separately calculate the remaining information needed to predict the final state. This illustrates how language models can combine heuristic shortcuts with structured algorithmic solutions when tracking complex state changes. 
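</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> To see why the associative solution can be much shallower than step-by-step simulation, the sketch below (ours, not the paper's code) composes the same permutation sequence two ways: a left-to-right fold that needs one level per permutation, and a pairwise tree merge that needs only about log2(n) levels. </p></td></tr><tr><td class="dd" align="left" style="padding:4px 28px 12px;text-align:left;"><pre style="margin:0;padding:12px;background-color:#F6F8FA;border:1px solid #E1E8ED;border-radius:5px;font-family:Consolas,Menlo,Courier,monospace;font-size:13px;line-height:18px;white-space:pre-wrap;word-break:normal;color:#2D2D2D;">def compose(p, q):
    """Compose permutations given as lists: apply p first, then q."""
    return [q[p[i]] for i in range(len(p))]

def sequential(perms):
    """Sequential algorithm: one permutation per step -- depth n."""
    acc = list(range(len(perms[0])))  # identity permutation
    for p in perms:
        acc = compose(acc, p)
    return acc

def associative(perms):
    """Associative algorithm: merge neighbours pairwise -- depth ~log2(n)."""
    layer = list(perms)
    while len(layer) > 1:
        merged = [compose(layer[k], layer[k + 1])
                  for k in range(0, len(layer) - 1, 2)]
        if len(layer) % 2:  # odd element carries over to the next level
            merged.append(layer[-1])
        layer = merged
    return layer[0]

perms = [[1, 0, 2], [2, 0, 1], [0, 2, 1], [1, 2, 0]]
assert sequential(perms) == associative(perms)  # same state, far fewer levels</pre></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Because composition is associative, both routes provably land on the same final state; what differs is the serial depth required, which is exactly the kind of structural difference the prefix-patching and probing signatures are built to detect.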
</p></td></tr><tr><td id="evaluation-and-results" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;">Evaluation and Results</h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Researchers tested their theoretical algorithms against actual language models trained on permutation tasks, and they found that: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="list-style-type:disc;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Models consistently learned either the Associative Algorithm (AA) or Parity-Associative Algorithm (PAA), and activation patching experiments showed distinct signatures matching these two algorithms </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Linear probing confirmed that in PAA models, state parity is decodable from early layers, while AA models encode parity differently </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> PAA representations can be geometrically decomposed into orthogonal components (parity and "cluster identity") </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> AA models generally demonstrate better generalization to longer sequences </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> Attention pattern analysis revealed "parity heads" in early layers of PAA models (absent in AA models) </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"> AA models develop sparse, tree-like attention patterns in later layers </p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> These findings suggest transformers implement efficient, interpretable state tracking mechanisms that combine both algorithmic structures and heuristic features. 
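</p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> The linear-probing result above can be pictured in a few lines of scikit-learn. In this sketch the "activations" are synthetic stand-ins with parity linearly embedded along one direction (a deliberate simplification on our part), since loading a real checkpoint is beyond a newsletter snippet: </p></td></tr><tr><td class="dd" align="left" style="padding:4px 28px 12px;text-align:left;"><pre style="margin:0;padding:12px;background-color:#F6F8FA;border:1px solid #E1E8ED;border-radius:5px;font-family:Consolas,Menlo,Courier,monospace;font-size:13px;line-height:18px;white-space:pre-wrap;word-break:normal;color:#2D2D2D;">import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
d_model, n_examples = 64, 2000

# Stand-ins for layer-k hidden states: parity sits along one linear direction.
parity = rng.integers(0, 2, n_examples)        # 0 = even, 1 = odd permutation
parity_dir = rng.normal(size=d_model)
acts = rng.normal(size=(n_examples, d_model)) + np.outer(2 * parity - 1, parity_dir)

# A linear probe: if it decodes parity well, parity is linearly represented.
probe = LogisticRegression(max_iter=1000).fit(acts[:1500], parity[:1500])
print("probe accuracy:", probe.score(acts[1500:], parity[1500:]))</pre></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> High early-layer probe accuracy is the PAA signature reported above; an AA model's early layers should instead hover near chance on the same test.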
</p></td></tr><tr><td align="center" valign="top" style="padding: 20px 28px 20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/4bf4a1c3-0dc0-43d1-8036-03326149d85d/_E1ED7F31-C69E-4120-9375-F36758B9DA0C_.png?t=1741711511" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style=""><tr><td align="center" valign="middle" height="42" style="height:42px;background-color:#2C81E5;border-radius:10px;border:solid 0px #DFD150;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.fUNb4GdFo9D3F8WuLArtoV5sElgytBlvJRzI9WtI92Y-gu5P8gPssC6brKq8Ni_Qd3ctZp9ZpzD6-McIY8PAHgYOrROCwg7duIVpS7Il75lsK3cFYnaev9X2ljrai4h2/4er/qExZERPuR5CCoMME6COllg/h12/h001.xgMmPJ_UvN263sXCKFPXuo4HlMqeRnmTrAJH5JijmpY" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;padding:0px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td class="dd" style="padding: 20px;"><table width="100%" cellpadding="0" cellspacing="0" role="none" style="max-width:520px;margin:0 auto;"><tr><td class="q" style="padding:16px 16px 6px 16px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoDDFT6eh5Nsg0xYVQj-h6I3o9m2k79_qw4izMYhmcI360xzYgitOUeI7MzL5VakazdSXoS5euFlUO_iQYAs3P-oAKTJwxqDsPK45JR6Xr7TdQB494Q866UWucsZHU2Gi0odo4B1LonjKasykUYc15pw/4er/qExZERPuR5CCoMME6COllg/h13/h001.Cf9Q22Rrak9CadUH3IlDbjWByja0pNBDi-aJih1Bwp4" style="text-decoration:none !important;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="100%" style="padding: 0 0 14px 0;text-decoration:none;width:100%;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="36" style="width:36px;"><img src="https://pbs.twimg.com/profile_images/1698572487909400576/BvncwnrP_normal.jpg" alt="tw profile: The AI Timeline" style="display:block;width:36px;height:36px;border-radius:50%;border:0;"/></td><td width="400" style="padding:0 0 0 8px;text-decoration:none;"><span style="display:block;font-size:14px;color:#1c2022;font-weight:700;"> The AI Timeline </span><span style="display:block;color:#697882;font-size:14px;"> @TheAITimeline </span></td><td width="24" align="right" style="vertical-align:text-top;"><img width="24" height="24" loading="lazy" alt="tw" style="border:0;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_logo.png"/></td></tr></table></td></tr><tr></tr><tr><td style="word-break:break-word;"><p>🚨This week's top AI/ML research papers:</p><p>- PokéChamp <br>- (How) Do Language Models Track State? 
<br>- Muon is Scalable for LLM Training <br>- The First Few Tokens Are All You Need <br>- Optimal Hyperparameter Scaling Law in LLM Pretraining <br>- Phi-4-Mini Technical Report <br>- L1: Controlling How… <span>x.com/i/web/status/1…</span></p></td></tr><tr><td style="padding:12px 0 0 0;"></td></tr><tr><td align="center" style="padding:8px 0 0 0;width:480px;"><img src="https://pbs.twimg.com/media/Glo3fyhb0AAqOGk.jpg" width="480" height="auto" style="display:block;border:1px solid #E1E8ED;border-radius:5px;width:100%;max-width:480px;height:auto;"/></td></tr><tr><td height="8" style="line-height:1px;font-size:1px;height:8px;"> </td></tr><tr><td align="left" valign="top" class="s"><p>12:23 AM • Mar 10, 2025</p></td></tr><tr><td height="10" style="line-height: 1px; font-size: 1px; height: 10px;"> </td></tr><tr><td height="1" bgcolor="#e1e8ed" style="line-height:0px;font-size:0px;height:1px;"></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td align="left" valign="top" class="s"><p><b style="color:#1C2022">471</b> Likes <b style="color:#1C2022">53</b> Retweets </p></td></tr><tr><td align="left" valign="top" class="s"><div align="center" style="text-align:center;margin-top:4px;margin-bottom:4px;padding:8px;border:1px solid #ccd6dd;border-radius:9999px;color:#1B95E0"><b>1 Reply</b></div></td></tr></table></a></td></tr></table></td></tr></table></td></tr><tr><td class="b" align="center" valign="top" bgcolor="#2a2a2a" style="padding:0px;border-bottom-left-radius:0px;border-bottom-right-radius:0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" bgcolor="#73ddff" style="padding:12px"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><span style="padding-left:1px;"></span></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCQqcWoV4NNHHr5SkP9THApWuHAAlWLQxI3Q_IqFmt_DcyAxeC8jDApCnHmMSBGpBb5sgtimvBYgxRX-Rp7s0F3LjCHoSwdhr83OBqRFhJ1y_/4er/qExZERPuR5CCoMME6COllg/h14/h001.01i4iWirCD-ci4YyyhmvyzxcUXoHNY30h2Q8vXZ6oOE" style="text-decoration:none;"><img width="22" alt="tw" border="0" style="display:block;max-width:22px;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_dark.png"/></a></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmBoQnQ9VXnB2zTxBG4HeHBgjMqVxpoXRdj01cjwyoVlHgiebEOgBvwHtevoVpsSvpn3Q1di2ml6sb3cBM-X6IStQbj_zQSVGWJ8AAmPw2en2/4er/qExZERPuR5CCoMME6COllg/h15/h001.IlmsDPaEz_EhSvUq7CTzyz4pktf-Zn2HRGRthPrPdyI" style="text-decoration:none;"><img width="22" alt="yt" border="0" style="display:block;max-width:22px;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/youtube_dark.png"/></a></td><td><span style="padding-left:1px;"></span></td></tr></table></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td class="w" align="center" valign="top" style="padding:15px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> Update your email preferences or unsubscribe <a class="link" 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsBhEpz-DJgyVFmavJPa0OyKRRnvw4o7XGyvIv7PRofnmpu9atCfEKnwP3Z1mnofq4bXBr2BzFVnieVx5kC7BlXPu6UHUGwDrqjl2UF6fYz6gAFbzUlpiXnJO6kOPBg--NRiKxL-UrGJVtLnNdDCUP6QMuC3aY01BuNBPnvtYVFApcFBnLD_OdI5KZxzVHo67bMaVrTdME9YnSUF859OIiFwEZTV7u9-ew50kEs7q8GohS9AUrACyVe0g6qhAExTXz_IBlgUVqiHNU8zEV53PZHCQrEV3uLa98VV7ZHAxL6y7QEd0Vy_aQovQ6LR55IBHyZaowlrZEEQyrKl6dxVzGD8vEcSWgEURIzDLKsxz845w3fA2IK546mP_2Pd9A6WkI-Aa7nzg_loSwWmjCEDoUL_m_OjNLzkcyf5_FplJy7Z9G6JgFGHCInLh-x419aVXVnKGbUFP1NBZ3FhhjJgKlTjPrLTlyDpdzXa6n5ASoEWHH5zFj2HIHgLi-Y3vdy7Ro_pcbjjkifInQdUJQr2CpAcoH2-WUb4jgYSQCc5cyWOOuEcntlCH7A9s-lf0HTW0yqACRPNo2sHrf-NGY43iQJ1FJvWb7O__6424blhJMTh97ZYvBbiDLqojlw_9gwcG-0GM_rX5b_Nax3HfKJl06GxextsG10HOjoKaTCMrWAgrB8izE5Oq5EbAGyQALdUKWuqYg1umzt5eL4bWq0FhvalXjCwG_LkAy-xn-CImLREq8bycpAhs3cuHAzbb5kI2Zg/4er/qExZERPuR5CCoMME6COllg/h16/h001.sgoxom09CGEmoTGs2yZRIICO03iYsnQjsIl23zbC5fY" style="text-decoration:underline;text-decoration-color:#FFFFFF!important;color:#FFFFFF!important;"> here</a></p><p class="copyright" style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> © 2025 bycloudai </p><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> 228 Park Ave S, #29976, New York, New York 10003, United States </p></td></tr><tr style="display: table-row !important;"><td align="center" valign="top" style="padding-top:20px;" style="display:table-cell !important;"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="display:table !important;"><tr style="display:table-row !important;"><td class="u" align="center" valign="middle" height="32" style="height:32px;display:table-cell !important; max-height: 32px !important;margin:0px !important; background-color: #ffffff !important;"><a style="line-height:32px !important;text-decoration:none;display:block !important;" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28olDWFpV5DDKfdk_OdOKOhwbHJziM-SedaVqwCy-7xUBk6LgMIEqMLrZRSuZx_I15tiJ8cKjoZWOtu6JGh7k7QEjgeU3Iv_opW_m4_BzCCkMcPSl6VrG2N0ad2ICA4dbTL4bQ2ecXXNoxxoDZLrSvm-GTCJtdtGNBRdcwQU7fZVbDhSWhHpH4MCADfVD1cIeAhrhvCcVEMPJCNRzCn4mkcDAPcukM-ejbVWNfRxpYp-/4er/qExZERPuR5CCoMME6COllg/h17/h001.3eNssRQUITpqOnkCku6tpVn76pjBhNsmq2kh1Bosyr8"><img src="https://media.beehiiv.com/output-onlinepngtools.png" width="16" alt="beehiiv logo" style="display:inline-block !important;max-width:16px !important; vertical-align:-3px !important;width: 16px !important;" border="0"/><span style="padding-left:11px !important;display: inline-block !important;">Powered by beehiiv</span></a></td></tr></table></td></tr><tr><td align="left" valign="top" height="2" style="height:2px;"><a href='https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWsHIaP4XNp0WgUYqLvHcKk_3uqk_KIkz4ddLinhFbud6JuxLFdSUhYnR7b1NSsmbtzXNGNblnEEMKUtkCAjkn8Y/4er/qExZERPuR5CCoMME6COllg/h18/h001.Mr4bBeAPs-ZLssBFyUWsoLn_B35_RZF2-37fzjhMMcs' style="color: #2a2a2a !important; cursor: default; font-size: 1px; text-decoration: none;"> Terms of Service </a></td></tr></table></td></tr></table></td></tr></table></td></tr></table></td></tr></table></td></tr></table></div></body></html>