<!DOCTYPE html><html lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" style="font-size:16px;"><head><meta charset="utf-8"/><!--[if !mso]><!--><meta http-equiv="X-UA-Compatible" content="IE=edge"/><!--<![endif]--><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="x-apple-disable-message-reformatting"/><meta name="format-detection" content="telephone=no,address=no,email=no,date=no,url=no"/><meta name="color-scheme" content="light"/><meta name="supported-color-schemes" content="light"/><title>TTRL: Test-Time Reinforcement Learning</title><!--[if mso]><xml><o:OfficeDocumentSettings><o:AllowPNG/><o:PixelsPerInch>96</o:PixelsPerInch></o:OfficeDocumentSettings></xml><![endif]--><style> :root { color-scheme: light; supported-color-schemes: light; } body { margin: 0; padding: 0; min-width: 100%!important; -ms-text-size-adjust: 100% !important; -webkit-transform: scale(1) !important; -webkit-text-size-adjust: 100% !important; -webkit-font-smoothing: antialiased !important; } .body { word-wrap: normal; word-spacing:normal; } table.mso { width: 100%; border-collapse: collapse; padding: 0; table-layout: fixed; } img { border: 0; outline: none; } table { mso-table-lspace: 0px; mso-table-rspace: 0px; } td, a, span { mso-line-height-rule: exactly; } #root [x-apple-data-detectors=true], a[x-apple-data-detectors=true], #MessageViewBody a { color: inherit !important; text-decoration: inherit !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important; } span.MsoHyperlink { color: inherit !important; mso-style-priority: 99 !important; } span.MsoHyperlinkFollowed { color: inherit !important; mso-style-priority: 99 !important; } .a { background-color:#dedede; } .b { background-color:#2a2a2a; } .c { background-color:#ffffff; } .d { background-color:#fff0c8; } .d2 { 
background-color:#FFFFFF; } .d3 { background-color:#FFFFFF; } h1 a { text-decoration:none;color:#2C81E5;font-style:italic; } h2 a { text-decoration:none;color:#2C81E5;font-style:italic; } h3 a { text-decoration:none;color:#2C81E5;font-style:italic; } h4 a { text-decoration:none;color:#2C81E5;font-style:italic; } h5 a { text-decoration:none;color:#2C81E5;font-style:italic; } h6 a { text-decoration:none;color:#2C81E5;font-style:italic; } h1, h1 a, h2, h2 a, h3, h3 a, h4, h4 a, h5, h5 a, h6, h6 a, ul, li, ol, p, p a { margin: 0;padding: 0; } h1 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:28px;color:#2A2A2A;line-height:42px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h2 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:700;font-size:24px;color:#2A2A2A;line-height:36px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h3 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:20px;color:#2A2A2A;line-height:30px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h4 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:18px;color:#2A2A2A;line-height:27px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h5 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:16px;color:#2A2A2A;line-height:24px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } h6 { font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif;font-weight:400;font-size:14px;color:#2A2A2A;line-height:21px;padding-bottom:4px;padding-top:16px;mso-margin-top-alt:16px;mso-margin-bottom-alt:4px } p { font-family:'Georgia','Times New 
Roman',serif;font-weight:400;color:#2D2D2D;font-size:16px;line-height:24px;padding-bottom:8px;padding-top:8px;mso-margin-top-alt:8px;mso-margin-bottom-alt:8px; } p a, .e a, ul a, li a, .h a, .h2 a, .h3 a { word-break:break-word;color:#2C81E5 !important;text-decoration:none;font-style:italic; } p a span, .e a span, ul a span, li a span { color: inherit } p .bold { font-weight:bold;color:#2D2D2D; } p span[style*="font-size"] { line-height: 1.6; } .f p { font-size:12px;line-height:15px;color:#2D2D2D;padding:0; } .f p a { color:#2D2D2D !important; } .g p { font-family:'Helvetica',Arial,sans-serif;font-size:14px;line-height:20px;font-weight:normal;margin:0; } .g p a { text-decoration: underline; } .i p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i p a { color:#2D2D2D !important; } .i2 p { font-family:'Helvetica',Arial,sans-serif;line-height:23px;font-size:15px;color:#2D2D2D; } .i2 p a { color:#2D2D2D !important; } .i3 p { font-family:'Helvetica',Arial,sans-serif;line-height:43px;font-size:24px;color:#2D2D2D; } .i3 p a { color:#2D2D2D !important; } .h p a { color:#595959 !important; } .h2 p a { color:#595959 !important; } .h3 p a { color:#595959 !important; } .f p a, .i p a, .i2 p a, .i3 p a, .h p a, .h2 p a, .h3 p a { text-decoration:underline; } .j { border-top:3px solid #ffeb2d; } .k p { padding-left:15px;padding-bottom:0px;padding-top:6px;mso-margin-top-alt:6px;mso-margin-bottom-alt:0px;mso-margin-left-alt:15px; } .o { background-color:#FFFFFF;border:1px solid #F1F1F1;border-radius:5px; } .o p { font-family:'Helvetica',Arial,sans-serif;padding:0px;margin:0px; } .l p, .l p a { font-size:14px;line-height:20px;font-weight: bold;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .m p, .m p a { font-size:13px;line-height:18px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .n p, .n p a { 
font-size:12px;line-height:17px;font-weight:400;color:#2D2D2D;padding-bottom:6px;mso-margin-bottom-alt:6px;text-decoration:none; } .p { background-color:#FFFFFF;max-width:520px;border:1px solid #E1E8ED;border:1px solid rgba(80, 80, 80, 0.3);border-radius:5px; } .q { font-size:16px;font-family:Helvetica,Roboto,Calibri,sans-serif !important;border:1px solid #e1e8ed;border:1px solid rgba(80, 80, 80, 0.3);border-radius:10px;background-color:#FFFFFF; } .q p { font-size:16px;font-family:system-ui,Helvetica,Roboto,Calibri,sans-serif !important;color:#222222;padding:4px 0; } .r { border:1px solid #E1E8ED !important;border-radius:5px; } .s p { font-size: 14px; line-height: 17px; font-weight: 400; color: #697882; text-decoration: none; } .t p { font-family:'Helvetica',Arial,sans-serif;font-size:12px;line-height:18px;font-weight:400;color:#000000;font-style:italic;padding:4px 0px 0px; } .v { border-radius:10px;border:solid 0px #DFD150;background-color:#2C81E5;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;color:#FFFFFF; } .v a { text-decoration:none;display:block;color:#FFFFFF; } .w p { font-size:12px;line-height:15px;font-weight:400;color:#FFFFFF; } .w p a { text-decoration: underline !important;color:#FFFFFF !important; } ul { font-family:'Helvetica',Arial,sans-serif;margin:0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:disc;font-size:16px; } ul > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:disc; } ol { font-family:'Helvetica',Arial,sans-serif;margin: 0px 0px 0px 25px !important;padding:0px !important;color:#2D2D2D;line-height:24px;list-style:decimal;font-size:16px; } ol > li { font-family:'Helvetica',Arial,sans-serif;margin:10px 0px 0px 0px !important;padding: 0px 0px 0px 0px !important; color: #2D2D2D; list-style:decimal; } .e h3, .e p, .e span { 
padding-bottom:0px;padding-top:0px;mso-margin-top-alt:0px;mso-margin-bottom-alt:0px; } .e span, .e li { font-family:'Helvetica',Arial,sans-serif;font-size:16px;color:#2D2D2D;line-height:24px; } .rec { font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji" !important; } .rec__button:hover { background-color: #f9fafb !important; } .copyright a {color: inherit !important; text-decoration: none !important; font-size: inherit !important; font-family: inherit !important; font-weight: inherit !important; line-height: inherit !important;} .txt_social p { padding: 0; word-break: break-all; } .table, .table-c, .table-h { border: 1px solid #C0C0C0; } .table-c { padding:5px; background-color:#FFFFFF; } .table-c p { color: #2D2D2D; font-family:'Helvetica',Arial,sans-serif !important;overflow-wrap: break-word; } .table-h { padding:5px; background-color:#F1F1F1; } .table-h p { color: #2A2A2A; font-family:'Trebuchet MS','Lucida Grande',Tahoma,sans-serif !important;overflow-wrap: break-word; } @media only screen and (max-width:667px) { .aa { width: 100% !important; } .bb img { width: 100% !important; height: auto !important; max-width: none !important; } .cc { padding: 0px 8px !important; } .ee { padding-top:10px !important;padding-bottom:10px !important; } .ff ul, .ff ol { margin: 0px 0px 0px 10px !important;padding: 0px !important; } .ff li { margin:10px 0px 0px 10px !important; } .r {height:140px !important;} .s p { font-size:13px !important;line-height:15px !important; } .mob-hide {display:none !important;} .mob-stack {display:block !important;width:100% !important;} .mob-w-full {width:100% !important;} .mob-block {display:block !important;} .embed-img {padding:0px 0px 12px 0px !important;} .socialShare {padding-top:15px !important;} .rec { padding-left:15px!important;padding-right:15px!important; } .bodyWrapper { 
padding:7px 4px 7px 4px !important; } .social-mobile {float:left !important;margin-top:10px !important;} } @media screen and (max-width: 480px) { u + .a .gg { width: 100% !important; width: 100vw !important; } .tok-heart { padding-top:75% !important; } .tok-play { padding-top: 250px !important; } } @media screen and (max-width: 320px) { .tok-heart { padding-top:65% !important; } } .u { border: 1px solid #CACACA !important; border-radius: 2px !important; background-color: #ffffff !important; padding: 0px 13px 0px 13px !important; font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif !important;font-size: 12px !important; color: #767676 !important; } .u a { text-decoration: none; display: block !important; color: #767676 !important; margin: 0px !important; } .u span, .u img { color: #767676 !important;margin:0px !important; max-height:32px !important;background-color:#ffffff !important; } </style><!--[if mso]><style type="text/css"> sup { font-size: 100% !important;vertical-align: .5em !important;mso-text-raise: -1.5% !important;line-height: 0 !important; } ul { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ul li { margin-left: 0px !important; mso-special-format: decimal; } ol { margin-left:0px !important; margin-right:10px !important; margin-top:20px !important; margin-bottom:20px !important; } ol li { margin-left: 0px !important; mso-special-format: decimal; } li.listItem { margin-left:15px !important; margin-top:0px !important; } .paddingDesktop { padding: 10px 0 !important; } .edm_outlooklist { margin-left: -20px !important; } .embedImage { display:none !important; } </style><![endif]--><style> @font-face { font-family: 'Open Sans'; font-style: normal; font-weight: 700; font-display: swap; src: url('https://fonts.gstatic.com/s/opensans/v40/memSYaGs126MiZpBA-UvWbX2vVnXBbObj2OVZyOOSr4dVJWUgsg-1x4gaVIUwaEQbjA.woff2') 
format('woff2'); } @font-face { font-family: 'Open Sans'; font-style: italic; font-weight: 700; font-display: swap; src: url('https://fonts.googleapis.com/css2?family=Open+Sans:ital,wght@1,700&display=swap') format('woff2'); } </style></head><body class="a" style="margin:0px auto;padding:0px;word-wrap:normal;word-spacing:normal;background-color:#dedede;"><div role="article" aria-roledescription="email" aria-label="email_name" lang="en" style="font-size:1rem"><div style="display:none;max-height:0px;overflow:hidden;"> Plus more about Process Reward Models That Think and PHYBench: Holistic Evaluation of Physical Perception and Reasoning in Large Language Models  ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ </div><table role="none" width="100%" border="0" cellspacing="0" align="center" cellpadding="0" class="gg"><tr><td align="center" valign="top"><table role="none" width="670" border="0" cellspacing="0" cellpadding="0" class="aa" style="width:670px;table-layout:fixed;"><tr><td class="bodyWrapper" align="center" valign="top" style="padding:7px 7px 7px 7px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" style="border-width:0px 0px 0px 0px;border-style: solid; border-color: #2a2a2a;border-radius:10px 10px 0px 0px;background-color:#ffffff;" class="c"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr id="header"><td style="padding:28px 28px 0 28px;"><div style="padding-top:0px;padding-right:0px;padding-bottom:20px;padding-left:0px;"><table role="none" width="100%" border="0" cellspacing="0" 
cellpadding="0" align="center"><tr><td class="f" align="right" valign="top"><p> April 29, 2025 | <a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxSdB5RCIH6yy1Fm1CYma3Ex2VVfKY346A8adMdWroClaH8g6Mx3DWx-K0zYDj5OJH262C-uCfQ8K5TcTnhdsgLA3BXAJC3QhN0BGuSRtXwdls2P_4QW-hr5wrsOgvll3CgVKjD2lhd3uRQPS1y1Lf3NWY8MsfS9SBqiTThGUHEWO35j_DAPsPCi1jpm5Dugkey1hq5BW3v08j90OnEXaCF5a375ojeLXADkH8M0TC44p6PxjLL3jm2uf3Rnr1ivq2b5K_HI1AQpBNHoi4f07X7bgSEJTl6ZfEfExfKxpOiS-20l7w3V37Wq2DbabQGqg1pCVOF3Skh4xRAtN3cVjQFOcYnPwammkjI8R3MUklCk6FgSeRKStwZODedT2d07gMoyzG-utnqj8jKJbf7dWIx3Ww1gGYVzOYyPhiwUxrbhLoyJWTyvNVikaajxFNb-y_l2EsSD0p5bQ5LgprhC8hiNldXqR8Rme5GP2TIj6SVczRWmU3TErKTd1cWd1sAMJpVBAA_YVoGmuRhOxQo-JBXPGD_Sgk7q6r6vuRNpvwOPh6yJcJjlPmLtySrL8iXUvsbsBBKPDV1QDRWkLT0VJwGUknhq0MROkoIDAAIz_p1-NdWqzghf2XNvdsGr64dJGxgmFJWeHibj_YsBD6w7LaGU/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h0/h001._cBs3pVHsH53SBHidACVutvC5GQ2qxGFnHhE93iLxyI"><span class="translation_missing" title="translation missing: en.templates.posts.email.v3.header.read_online">Read Online</span></a></p></td></tr><tr><td class="dd" align="center" valign="top" style="padding:15px 0;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><h1 style="text-align:left;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;font-weight:Bold;font-size:32px;color:#2A2A2A;padding:2px 0;line-height:38px;"> TTRL: Test-Time Reinforcement Learning </h1><p style="text-align:left;font-family:'Helvetica',Arial,sans-serif;font-weight:normal;font-size:20px;color:#3E3E3E;padding:5px 0;line-height:24px;"> Plus more about Process Reward Models That Think and PHYBench: Holistic Evaluation of Physical Perception and Reasoning in Large Language Models </p></td></tr></table></td></tr><tr><td style="height:0px;width:0px;"><div style="height:1px;" data-open-tracking="true"> <img 
src="https://elink4f7.mail.bycloud.ai/ss/o/u001.3wmUuY8gEWd4_869a_eXcg/4g2/dNb_B7-3TOqMXUk2Pq45KQ/ho.gif" alt="" width="1" height="1" border="0" style="height:1px !important;width:1px !important;border-width:0 !important;margin-top:0 !important;margin-bottom:0 !important;margin-right:0 !important;margin-left:0 !important;padding-top:0 !important;padding-bottom:0 !important;padding-right:0 !important;padding-left:0 !important;"/> </div></td></tr></table></div></td></tr><tr id="content-blocks"><td class="email-card-body" align="center" valign="top" style="padding-bottom:28px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td id="nov-18-th-nov-24-th-33-latest-ai-re" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h6 style="color:#2A2A2A;font-weight:normal;"><i>Apr 21st ~ Apr 27th</i><br><i>#53 Latest AI Research Explained Simply</i></h6></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="industry-news-in-1-line" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">🗞️ Industry News in 1 Line</h2></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span 
style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 7.3k</span></span> <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.wcXdj6dB6nd1Cx4inzJNk_td6EHXs7wvRPySUkyhYFcE8MHwqtvoYgf4Nsjuw7-IxVJ04UA5BnePOx0ronYcastRtC0G36qPs7Yzj9UYB6udHvmG_vHkVKZ1-HUKnICk/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h1/h001.fRr7gzncr6EvgXJkIWJrnDTsjCK1T-zgw3fpWOuLFqE" target="_blank" rel="noopener noreferrer nofollow"><span>Qwen has released Qwen3</span></a>, a new series of eight open-weight large language models ranging from 235B-A22B (MoE), 30B-A3B (MoE), 32B, 14B, 8B, 4B, 1.7B, and 0.6B parameters. The flagship Qwen3-235B-A22B model achieves competitive results against other OS models. </p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:450px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/d4dba8e7-3bab-4229-bfce-04fa7699f349/Gppj9_kbEAAkO9U.jpg?t=1745960345" alt="Qwen3-235B-A22B benchmark" height="auto" width="450" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:450px;"><p>Qwen3-235B-A22B benchmark</p></td></tr></table><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"></p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 12k</span></span> OpenAI has enhanced ChatGPT with <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCVkmoyf5Qo7ePmgXlKGAl8tp_wA2iAx4nP9hng6iYl5-sI7iuUb7RWzCPW4ejLrzoedA0cFYTzc9I_uiavfFbU61I_9zwD0B81xERse3j37u4Hjp7y9MuVbt4Rcqi3vtng/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h2/h001.ZPkVPE967Y9fQKT94yV_UGx8iN-7Uk-t9V66y8JnYO0" target="_blank" rel="noopener noreferrer nofollow"><span>improved 
search features</span></a> like better citations and WhatsApp integration, alongside introducing a <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCVkmoyf5Qo7ePmgXlKGAl8vWo2MixjnGEvzz5ogT0C-j27qM65-QWTZwbhOii9XuJns9GNXmyiIo5ObeFEE-bzBERam5t0Ow-fZTn88ijhpLCXT2HVVgE07ugOPRuKKXEA/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h3/h001.SI48XZYL1WBSlMO5_Gv73mIFjdPiesvJhuNwV0WwiRg" target="_blank" rel="noopener noreferrer nofollow"><span>new lightweight version</span></a> to expand access to its popular "deep research" capability for both paid and free users. Additionally, an improved shopping experience featuring detailed product information and direct purchase links is currently rolling out to all user tiers. </p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:480px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/5e5f99da-a028-4711-9374-bcfc88823845/Screenshot_2025-04-29_171010.png?t=1745961021" alt="" height="auto" width="480" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:480px;"><p>ChatGPT native shopping</p></td></tr></table></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;">♥ 1.1k</span></span> <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.sa7HIrCkEzyny8lstY55mAXsWWiqRLG5rGk9g8WJDhHte5zRXNjcLts3YqGHKtxCTOq5Cg4p0_X-yOrJwpV6G0HimklGADRvlj21koqgkKgkvYTt7ob2s5Wvn_z0usPV/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h4/h001.YTrvltn_-0FROn-28PCktKh8o2qZ_-ZTTd0k1bESFp4" target="_blank" rel="noopener noreferrer nofollow"><span>NousResearch</span></a> has introduced Minos, a new binary classifier designed to detect refusals from LLMs by estimating the likelihood that a 
response constitutes a refusal, potentially aiding redteamers and jailbreakers. Built upon AnswerAI's ModernBERT-Large 400M model for quality and speed, <a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWkKposCjC6uDdnsLruTBRxeCp5vm0sRKvWeEbNGTVAYWOz9QLIb9QEZtum1W_orO2dLDeXkGIDGXVmiXHSrqbDbpArgh1S348HtTin9RgkWtVNDkDu1krE8lQXEFmuLh7w/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h5/h001.ucfF1KS46axoLGmtCvMmCgcDxrb0VOtERVLualUFqeA" target="_blank" rel="noopener noreferrer nofollow"><span>Minos is available on HuggingFace</span></a> along with example scripts for usage. </p><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:480px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/c1affbe9-3512-41d9-838f-dc48a1499081/GpUc7CUagAAoyfS.jpg?t=1745961357" alt="" height="auto" width="480" style="display:block;width:100%;" border="0"/></td></tr></table></li></ol></div></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="transparent" style="background-color:transparent;border-color:#2C81E5;border-style:solid;border-width:5px;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;"><span style="">bycloud’s new project: search AI papers semantically</span></h2></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><a 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1r-Oo1v2APbJapoJieKk3Wpe7ldTezUu5UQjr3YxkjDr64dknbQx1-38CKAMc8wQVG5Tk8oQAyiI7e9mg3sBDvoe5v43TFe36B13zOTMDEQ/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h6/h001.hl4yqyBhQUOLkxZXRT_kT4cKfFrj9uPBNZ07aXD1eKA" rel="noopener noreferrer nofollow" style="text-decoration:none;" target="_blank"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/0f46c7c1-9068-4af6-882f-1c91dd0370d7/Screenshot_2025-04-22_122639.png?t=1745339251" alt="" height="auto" width="600" style="display:block;width:100%;border-radius:0px 0px 0px 0px;border-style:solid;border-width:0px 0px 0px 0px;box-sizing:border-box;border-color:#E5E7EB;" border="0"/></a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">Hey guys! I am very excited to share with you my latest project that I just shipped, called:</span></p></td></tr><tr><td class="dd" align="center" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:center;"><h2 style="color:#2A2A2A;font-weight:Bold;"><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1r-Oo1v2APbJapoJieKk3Wpe7ldTezUu5UQjr3YxkjBTNAZ-93a5i8IJfW4-u9rJZxkf6XD6ekQp-u8OhENEm59imNEQOPS73uHpJ7HDfy4/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h7/h001.kqqTcvttvJYMWoydbXPaHHQ6rvDbasBu_XW0YN638C4" target="_blank" rel="noopener noreferrer nofollow"><span>findmypapers.ai</span></a></span></h2></td></tr><tr><td class="dd" align="center" style="padding:0px 28px;text-align:center;word-break:break-word;"><p style="line-height:24px;"><span style="">A semantic search engine</span><span style=""><b> </b></span><span style="">for 300k+ AI research papers!</span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><b>Outcompete Deep Research 
apps</b></span><span style=""> like Grok, OpenAI, Perplexity, and Gemini at finding relevant papers. Check out my demo video on YouTube:</span></p></td></tr><tr><td class="dd" align="center" valign="top" style="padding:20px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmCSDVrazcsfChikC1Hy1NlH2PQHxY8iiO_nPIdJTlo2PwnvqFZoEIb9aYF-NQTG9kB-aMJXvylfRxmwVf-v8YYTkYMX4UykMdxCG1WFTu6N0/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h8/h001.v597yiGZvv6Ezq4aJuh739SDDrD_ouc77-ZwlC-38LQ" style="text-decoration:none;"><table align="center" width="100%" cellpadding="0" cellspacing="0" border="0" role="none" style="max-width:520px;margin:0 auto;"><tr><td class="p" width="100%" style="padding:2px;border:none;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td align="center" valign="top" style="width:100%;"><div style="max-height:0;position:relative;opacity:0.999;width:100%;mso-hide:all;"><div style="display:inline-block;width:100%;padding-top:25%;"><img width="20%" height="auto" loading="lazy" alt="" style="border:0;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/youtube_play_icon.png"/></div></div><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmCSDVrazcsfChikC1Hy1NlGs5AonJnymKAIboweUgZ19vv4votegK56ZE9RUe_JYMQxsRJ9oZQovTrWEPyQDQkvPQO_cCcw-aMQDAdlS3piQ/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h9/h001.YGz1JkL-B-cAAiHR5Q00Kg5sKYqT6270vn5MqvEFEjk" style="text-decoration:none;"><img src="https://i.ytimg.com/vi/KL5Au8pKJ38/maxresdefault.jpg" width="480" height="auto" loading="lazy" alt="YouTube video by bycloud" style="display:block;height:auto;border:0;outline:none;text-decoration:none;background-color:#000000;width:100%;"/></a></td></tr><tr><td><p style="font-size:12px;font-weight:500;font-style:italic;font-family:Helvetica, Calibri, sans-serif;color: #686a6d; padding-top:0 !important;padding-bottom:6px !important; padding-left:4px !important;"> AI Deep Research sucks 
at finding papers... So I built a better one </p></td></tr></table></td></tr></table></a></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">Specifically, there are </span><span style=""><b>~300,000 </b></span><span style="">AI/ML research papers currently indexed in my engine, and by next month, we are planning to increase this by 4x, indexing the entire </span><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.K3cROzwawwm7YuV5pc0SJXfxtLihxcTaidFjx3CIKRzwk63VM1qlc7kC2GmOBrYg-uQAActhMuP4Ha7epiynNViVnJVxu1mFbUqvBcx5Aes/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h10/h001.kk3KyXY9vf9pMQUnr5iyAywLc_MoTPGaLBetRt7QdE4" target="_blank" rel="noopener noreferrer nofollow"><span>Arxiv.org</span></a></span><span style="">. </span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i><b>But why ANOTHER search engine</b></i></span><span style=""><b>?</b></span><span style=""> So there are currently 2 problems with existing solutions:</span></p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Generative AI models trained on papers are prone to serving hallucinations</span></p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Deep Research Agents are good but waste compute by browsing 80% SEO-optimized slop</span></p></li></ol></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span 
style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1r-Oo1v2APbJapoJieKk3Y7-_bQjp1fXYJa9abKZXRUgyOs1NPSA74uaogeIHbgopbVpZ_VynKBPt72DltmApNNJhGKptQzw6LEjgr_5wTg/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h11/h001.gGGc4vbme3RYlSgQunOHofw-aixvmdiKMANf2Bh7_mo" target="_blank" rel="noopener noreferrer nofollow"><span>findmypapers.ai</span></a></span><span style=""> addresses both of these problems, and takes the best of both worlds.</span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">I believe that surveying research shouldn’t be that hard. You can be as specific and technical about your search query, and it won’t give you made-up, useless BS.</span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">Before you try:</span></p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Search time is long (est. 
1~3min depending on search breadth)</span></p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Limited to AI research papers, but will be expanding soon</span></p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="">Broad/wide search is really REALLY useful if you need a big compilation of information, like my own use cases</span></p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="">To celebrate our launch, use code </span><span style=""><b>BETAN50 </b></span><span style="">for 50% off for the next 2 months! You can follow our official </span><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCYfUZkPCVN-dWoLNWR_ZMb_doXnIaIcMVbwXdkB4cr2YfIXSLmfL5_kJtoxgXDNoI3BY9hlplzWMX9BDXz8A5pkI0L4iVuxL_TKxqAqtcaeZ/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h12/h001.a4L_cYg2ThPuIoC52oeHZ_xBT0wSrNxOalTiMahCj_c" target="_blank" rel="noopener noreferrer nofollow"><span>X account</span></a></span><span style=""> or </span><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.9ggl6Mt0xphuuMReR5gVpWPwAndoLzTm9asQYWRGsHurBY7u08OKEJWmmW9BgfmCCytIDpcDyMus-wNgIKcNzGVVqWeRIMxLMWcsEP5ialNUYt1fn7lWi6QSuqsByBk2/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h13/h001.inMqw4suZMSe2EhCNlLSXeZxVLhh9RwD7Svm2dIixHM" target="_blank" rel="noopener noreferrer nofollow"><span>Discord</span></a></span><span style=""> for any updates, and feedback is really appreciated. 
</span></p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j29rweNtSHTMPdCBNsuu15hSzQcOxkOpREY7CmsUqazs5MMa1uyTWQcHA44VJKvv5LOrj73ya7S6uuAAMt4LiYwQ-PzqQ4QspN0c-91wcDG-u/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h14/h001.xC_6sl3rM0JY2iJX6gkeLWrV2Tm0RQIpCllTS9TeWBg" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Check Out FindMyPapers </a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><a class="link" href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoGymQ3NNPtd5dE5MV_8UgjIDFPVXngz8pvQBldSW42yhUe_Qiq6DgEPMEBuPL9yfRpXelTiuu2kS8pLFvsoem_XoZoy_n13sTKUhZIbl0VH6/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h15/h001.1nxUZRUMBEk1jadjXUU9I1bDs_etSpYLNLaS2O9XeMI" target="_blank" rel="noopener noreferrer nofollow"><span>Advertise with The AI Timeline! 
</span></a></span></p></td></tr></table></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="ttrl-test-time-reinforcement-learni" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">TTRL: Test-Time Reinforcement Learning</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Zuo et al. [Tsinghua University, Shanghai AI Lab]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 554 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM RL </span></span></p></td></tr><tr><td id="training-ll-ms-without-a-teacher" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Training LLMs Without a Teacher</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> LLMs are getting better at solving complex tasks but sometimes they struggle with new test data which wasn’t part of their training dataset. 
Traditional RL trains models using clear reward signals derived from labeled data. But in dynamic environments, labels are scarce or nonexistent. Existing methods, such as majority voting during test-time scaling, help refine outputs but don’t actively improve the model’s underlying reasoning. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This paper introduces <span style="font-weight:700;"><i><b>Test-Time Reinforcement Learning</b></i></span><span style="font-weight:700;"><b> (TTRL)</b></span> approach by letting models teach themselves using nothing but raw test data. Instead of relying on external labels, the model samples multiple responses to a problem, votes on the best answer internally, and uses that consensus to guide its own learning. This turns unlabeled test data into a self-sustaining training loop. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/335b16d4-5672-4a06-a6f8-798e72fdf8ea/teaser.jpg?t=1745940384" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="how-test-time-reinforcement-learnin" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">How Test-Time Reinforcement Learning (TTRL) Turns Guesses Into Lessons</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> TTRL operates like a group of 
students that is debating solutions. For each input (e.g., a math problem), the model generates multiple candidate answers. A majority vote selects the most popular response, which becomes a pseudo-label. The model then rewards itself for answers matching this consensus and penalizes deviations. Over time, this feedback loop sharpens its reasoning. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/8cdd9a77-0a08-4429-93e9-7219d4ec79dd/ttrl_reward.png?t=1745940415" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This uses three key mechanisms: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Repeated Sampling</b></span>: By generating dozens of potential answers, the model explores diverse reasoning paths. This diversity ensures the majority vote isn’t skewed by repetitive errors. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Reward From Consensus</b></span>: Answers aligning with the majority earn a reward of 1; others get 0. 
Though simplistic, this binary signal stabilizes training by making it more consistent. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Policy Updates</b></span>: Using RL algorithms like GRPO or PPO, the model adjusts its parameters to favor high-reward behaviors. Crucially, this happens <span style=""><i>during inference</i></span>, allowing real-time adaptation. </p></li></ol></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/95e63edc-f506-473c-9a6f-31437bc04086/overview.png?t=1745940397" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="benchmarking-results-for-test-time-" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Benchmarking Results for Test-Time Reinforcement Learning (TTRL)</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Researchers tested TTRL on mathematical benchmarks like AIME 2024 and MATH-500 and it performed remarkably well. For instance, Qwen-2.5-Math-7B saw a <span style="font-weight:700;"><b>159% jump</b></span> in pass@1 accuracy on AIME after TTRL training, matching models trained with full supervision. These improvements generalized across tasks: a model fine-tuned on math problems also improved its performance on unrelated reasoning benchmarks. 
</p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Scaling Wins</b></span>: Larger models benefit more from TTRL. The 7B model outperformed its 1.5B counterpart by wider margins, as its capacity to generate accurate consensus labels fueled better rewards. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Surpassing the Vote</b></span>: Models consistently exceeded the performance ceiling set by their own majority votes. This "bootstrapping" effect suggests TTRL doesn’t just amplify existing knowledge, but it uncovers new reasoning strategies. </p></li></ul></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/8777d1a3-80c3-42a6-98cb-1429951c201a/image.png?t=1745940587" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> However, TTRL isn’t foolproof. Its success hinges on the model’s initial competence: weaker models (e.g., LLaMA-3.1-8B) showed minimal gains on harder tasks, likely due to insufficient prior knowledge. 
Hyperparameters like temperature and batch size also require careful tuning: too little exploration leads to stagnation, while too much introduces noise. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28yjf9KIXZdsXoh1WlHvvKnMIzj2D87R08j3j9CcC6UtpIb1TTlAM5JGGXqpaGjNfQpBGKKoJRpVUwUnF6UBJ7Ki_GS5-td7COFKWl_q-oWO/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h16/h001.-A5PAZMysRuC8AvC4TgFODRvWhV_hrvCSDIZ9Frl5As" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="process-reward-models-that-think" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">Process Reward Models That Think</h2></td></tr><tr><td 
class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Khalifa et al. [University of Michigan, Mila, LGAIResearch, University of Illinois Urbana-Champaign]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 147 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM PRM </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="using-generative-process-reward-mod" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Using Generative Process Reward Models to Build Better AI Verifiers with Less Data</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Modern AI systems rely heavily on verification to separate correct solutions from plausible-sounding errors. Until now, we have relied on huge labeled datasets or human verifiers, which is extremely slow. Process Reward Models (PRMs) can act as quality inspectors for AI reasoning and check each step in a solution. But training them has required expensive step-by-step annotations. Discriminative PRMs, which classify steps as correct/incorrect, struggle without this data. Meanwhile, zero-shot "LLM-as-a-judge" methods lack reliability, often overcomplicating verification or missing subtle errors. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/ab373e55-5924-47fa-b601-fcf1c0f8bd8b/image.png?t=1745940830" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This paper introduces THINKPRM, a new method that combines generative reasoning with minimal supervision. THINKPRM leverages existing reasoning capabilities in language models more effectively to train generative verifiers using <span style=""><i>1% of the labeled data</i></span> needed by discriminative models while outperforming them. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/222ebb31-42de-4ed7-8d43-1ec051a882d5/image.png?t=1745940716" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="how-thinkprm-works" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">How THINKPRM Works</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> THINKPRM starts with pre-trained reasoning models (like Qwen or Llama variants) and fine-tunes them to generate verification CoTs. Instead of requiring manual annotations for every step, it uses a clever synthetic data pipeline: </p></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ol start="1" style="list-style-type:decimal;margin:0px 0px;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Synthetic Verification Chains</b></span>: A base model critiques solutions, producing step-by-step assessments. These critiques are filtered to match sparse gold labels (e.g., from datasets like PRM800K), ensuring only high-quality examples are kept. 
</p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Lightweight Fine-Tuning</b></span>: Models train on these filtered chains, learning to align their verification reasoning with correct judgments. This process takes just hours on a single GPU, even for billion-parameter models. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Scalable Verification</b></span>: At test time, THINKPRM generates multiple verification CoTs per solution. These can be aggregated (parallel scaling) or extended via self-correction prompts (sequential scaling) to boost accuracy. </p></li></ol></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/2e9a0716-b7e9-4b66-b731-cf0468aeafec/image.png?t=1745940772" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="benchmark-results-for-thinkprm" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);font-weight:700;"><b>Benchmark Results for THINKPRM</b></span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Researchers tested THINKPRM across math, coding, and science benchmarks, and it consistently outperformed both discriminative PRMs and zero-shot LLM judges. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/390d3ef2-b73d-4c0a-83a4-61e19b0801c3/image.png?t=1745940899" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>In-Domain Tasks</b></span>: On MATH-500 and AIME’24, THINKPRM-14B achieved up to 8% higher accuracy than discriminative PRMs trained on 100x more data. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Out-of-Domain Generalization</b></span>: Without any domain-specific tuning, THINKPRM surpassed discriminative models by 8% on PhD-level physics questions (GPQA) and 4.5% on code generation (LiveCodeBench). </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Compute Scaling</b></span>: Allowing THINKPRM to generate longer CoTs or aggregate multiple verifications improved results significantly. For example, doubling verification tokens boosted F1 scores by 15 points on ProcessBench, outperforming fixed-length discriminative checks. 
</p></li></ul></div></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Although THINKPRM is promising, it isn’t perfect. Its verification chains can be overconfident, and errors in early steps sometimes cascade to later judgments. It also becomes more computationally expensive with longer CoTs, though the performance gains often justify the expense. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/fde28639-81c8-4d7f-841c-ec9e342e5242/image.png?t=1745940926" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28yjf9KIXZdsXoh1WlHvvKlAzVQkfoXYjW2PkqygZoY4Iy0CMZGawmAr6sK8I2-gonN75iKxyLlOel5zIGTyOwvo7RgnD-o9XlLAzByaeZAh/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h17/h001.EtdY8vGNOPC78x8OJQYBrv7TNMx8vlKSeZwU_785Gic" target="_blank" rel="noopener noreferrer nofollow" 
style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" style=""><tr><td bgcolor="#222222" style="background-color:#222222;padding:0.0px 0.0px 0.0px 0.0px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"></p></td></tr></table></td></tr></table></td></tr><tr><td id="phy-bench-holistic-evaluation-of-ph" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:Bold;padding:0px 28px;text-align:left;"><h2 style="color:#2A2A2A;font-weight:Bold;">PHYBench: Holistic Evaluation of Physical Perception and Reasoning in Large Language Models</h2></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style=""><i>Qiu et al. 
[Peking University, Beijing Computational Science Research Center]</i></span></p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><span style="background-color:#e0e0e0;"><span style="color:rgb(255, 58, 58);font-size:0.6rem;"> ♥ 340 </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> LLM Reasoning </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span><span style="background-color:#e0e0e0;"><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> bycloud’s pick </span></span><span style="color:rgb(44, 129, 229);font-size:0.6rem;"> </span></p></td></tr><tr><td id="master-real-world-physics-reasoning" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Master Real-World Physics Reasoning with PHYBench </span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> AI is getting pretty good at creating basic English sentences and can talk about the weather. But it can’t really calculate the tension in a pendulum system after a sudden impact or predict relativistic effects in particle collisions. While today’s LLMs excel at many tasks, their ability to reason about the physical world remains surprisingly limited. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This is because current benchmarks for evaluating AI reasoning often fall short in three ways: they oversimplify problems, rely on abstract math disconnected from real-world scenarios, and use crude metrics like binary accuracy. 
These limitations make it hard to distinguish between models that genuinely grasp physics and those that merely memorize patterns. </p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/ad5300a9-3f7c-48eb-9318-b1d051d3af0e/image.png?t=1745941212" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> This paper introduces <span style=""><i>PHYBench</i></span>, a <span style="font-weight:700;"><b>new benchmark</b></span> designed to rigorously test how well AI models understand and reason through physics problems. PHYBench contains 500 carefully curated physics problems spanning mechanics, electromagnetism, thermodynamics, and advanced topics like relativity. Each problem mirrors real-world scenarios and requires complex thinking from calculating string tensions in pendulum systems to analyzing photon-mirror collisions. </p></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> It also introduces the <span style="font-weight:700;"><i><b>Expression Edit Distance (EED) Score</b></i></span>, a metric that measures how “close” a model’s answer is to the correct solution, even when it’s not fully right. This granular approach captures partial understanding, offering a clearer picture of where models stumble. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/280e3a2d-81ed-4d30-b5bd-d5369b914bbb/image.png?t=1745941246" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td id="understanding-the-phy-bench-benchma" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Understanding the PHYBench Benchmark</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Creating a benchmark for physical reasoning requires a good understanding of reality. PHYBench’s questions are adapted from human physics exams and olympiads, to ensure they reflect authentic challenges. For example, one problem asks models to determine the tension in a string connecting three suspended balls after one is struck. This task requires spatial reasoning, force analysis, and multi-step calculations. Each question undergoes rigorous review by physics students to eliminate ambiguities and ensure solvability through pure textual descriptions. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/dcdcb324-ef19-414b-bf50-f1cd9254e242/image.png?t=1745941294" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Traditional metrics like accuracy fail to reward models for getting <span style=""><i>part</i></span> of a solution right. But this benchmark uses EED Score to address this by comparing the structure of a model’s answer to the ground truth using expression trees. If a model forgets a term or miscalculates a coefficient, the EED quantifies how many “edits” (like adding or removing nodes in the tree) would fix the error. For instance, if the correct answer is <span style="color:rgb(24, 128, 56);"><i>T = 2mg + 4mv²/l</i></span> and a model outputs <span style="color:rgb(24, 128, 56);"><i>T = 2mg + 2mv²/l</i></span>, the EED Score reflects this as a minor edit rather than a complete failure. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/7e546467-53ae-41e9-92f2-54ce7a63ef69/image.png?t=1745941495" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr><tr><td align="center" valign="top" class="t" style="width:600px; padding: 4px 0px 4px 0px;"><p>Example questions and errors. The errors are from the solution generated by DeepSeek-R1. Here we demonstrate the main parameters and physical process.</p></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> Even state-of-the-art models often misjudge kinematic relationships or make algebraic slips during long chains of equations. PHYBench forces models to demonstrate both skills, exposing weaknesses that simpler benchmarks miss. </p></td></tr><tr><td id="testing-ai-models-on-real-physics-w" class="dd" align="left" valign="top" style="color:#2A2A2A;font-weight:normal;padding:0px 28px;text-align:left;"><h3 style="color:#2A2A2A;font-weight:normal;"><span style="color:rgb(67, 67, 67);">Testing AI models on Real Physics with PHYBench</span></h3></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"> When tested on PHYBench, even top-tier models like Gemini 2.5 Pro and GPT-4o lag far behind human experts. Humans scored 61.9% accuracy and 70.4 EED, while Gemini 2.5 Pro managed just 36.9% accuracy and 49.5 EED. Smaller models fared worse, with some scoring near zero. 
</p></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/9e6738f9-6105-4c1c-b9c5-2a66c550a8ce/image.png?t=1745941401" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td style="padding-bottom:12px;padding-left:50px;padding-right:40px;padding-top:12px;" class="ee"><div style="margin-left:0px;" class="edm_outlooklist"><ul style="font-weight:normal;list-style-type:disc;margin-bottom:12px !important;margin-top:12px !important;padding:0px 0px 0px 0px;"><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Specialization matters</b></span>: Models fine-tuned for reasoning outperformed general-purpose ones, but none approached human levels. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>Domain disparities</b></span>: All models struggled most with thermodynamics and advanced physics, suggesting gaps in handling multi-step processes or abstract concepts. </p></li><li class="listItem ultext"><p style="line-height:24px;padding:0px;text-align:left;word-break:break-word;"><span style="font-weight:700;"><b>EED’s advantage</b></span>: The EED Score provided 3x more discrimination between models than traditional accuracy, making it a sharper tool for benchmarking progress. 
</p></li></ul></div></td></tr><tr><td align="center" valign="top" style="padding-bottom:20px;padding-left:28px;padding-right:28px;padding-top:20px; " class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" style="margin:0 auto 0 auto;"><tr><td align="center" valign="top" style="width:600px;"><img src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/uploads/asset/file/bed490c6-18cd-4568-b38b-4fd24c788906/image.png?t=1745941362" alt="" height="auto" width="600" style="display:block;width:100%;" border="0"/></td></tr></table></td></tr><tr><td align="center" valign="top" style="padding-bottom:14px;padding-left:28px;padding-right:28px;padding-top:14px;text-align:center;width:100%;word-break:break-word;" class="dd"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="margin:14px auto 14px auto;"><tr><td align="center" valign="middle" height="44.75" style="height:44.75px;background-color:#2C81E5;border-color:#DFD150;border-radius:10px 10px 10px 10px;border-style:solid;border-width:0px 0px 0px 0px;color:#FFFFFF;font-family:'Open Sans','Segoe UI','Apple SD Gothic Neo','Lucida Grande','Lucida Sans Unicode',sans-serif;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28yjf9KIXZdsXoh1WlHvvKmUp_WdZhKhNdfE3AU2h4CWYJunmoKZPc1K4EjxLl9okCVtQfE9PaCZ1NqAjyKyrPjsjDeootROw58h-_vO1RUE/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h18/h001.9TGSIqmoKa53srV963V6T9a6OZddI6cnZdcmZJsBvVY" target="_blank" rel="noopener noreferrer nofollow" style="color:#FFFFFF;display:block;font-size:16px;font-size:16px;font-weight:normal;padding:0px 14px;padding:14px 14px 14px 14px;text-decoration:none;"> Read Full Paper </a></td></tr></table></td></tr><tr><td class="dd" style="padding: 20px;"><table width="100%" cellpadding="0" cellspacing="0" role="none" style="max-width:520px;margin:0 auto;"><tr><td class="q" style="padding:16px 16px 6px 16px;"><a 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.tLfGW26lAwaS9gFg17HSoDDFT6eh5Nsg0xYVQj-h6I3o9m2k79_qw4izMYhmcI36lDujfetHNCFYTsCl4P_tJ8RO4z6BVzdxSsFjWf0B4EE7Sr82il9xbbFC-i4R6a8lXuIeaT_b1DnjXWPN6aY7jhKpYL_8Deu9ehD-XxGT4LQ/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h19/h001.OBz6EExM9H-JE4UWSWqtB0hGLkaHLm0coBoKTCjo3-s" style="text-decoration:none !important;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="100%" style="padding: 0 0 14px 0;text-decoration:none;width:100%;"><table width="100%" cellpadding="0" cellspacing="0" border="0" role="none"><tr><td width="36" style="width:36px;"><img src="https://pbs.twimg.com/profile_images/1698572487909400576/BvncwnrP_normal.jpg" alt="tw profile: The AI Timeline" style="display:block;width:36px;height:36px;border-radius:50%;border:0;"/></td><td width="400" style="padding:0 0 0 8px;text-decoration:none;"><span style="display:block;font-size:14px;color:#1c2022;font-weight:700;"> The AI Timeline </span><span style="display:block;color:#697882;font-size:14px;"> @TheAITimeline </span></td><td width="24" align="right" style="vertical-align:text-top;"><img width="24" height="24" loading="lazy" alt="tw" style="border:0;" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_logo.png"/></td></tr></table></td></tr><tr></tr><tr><td style="word-break:break-word;"><p>🚨This week's top AI/ML research papers:</p><p>- Test-Time RL <br>- PHYBench <br>- Process Reward Models That Think <br>- Tiny Reasoning Models via LoRA <br>- Learning to Reason under Off-Policy Guidance <br>- SplitReason <br>- Learning Adaptive Parallel Reasoning with LMs <br>- Token-Shuffle <br>- Describe Anything</p></td></tr><tr><td style="padding:12px 0 0 0;"></td></tr><tr><td align="center" style="padding:8px 0 0 0;width:480px;"><img src="https://pbs.twimg.com/media/Gpj6w5wXgAA1Qb2.jpg" width="480" height="auto" style="display:block;border:1px solid 
#E1E8ED;border-radius:5px;width:100%;max-width:480px;height:auto;"/></td></tr><tr><td height="8" style="line-height:1px;font-size:1px;height:8px;"> </td></tr><tr><td align="left" valign="top" class="s"><p>6:29 PM • Apr 27, 2025</p></td></tr><tr><td height="10" style="line-height: 1px; font-size: 1px; height: 10px;"> </td></tr><tr><td height="1" bgcolor="#e1e8ed" style="line-height:0px;font-size:0px;height:1px;"></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td align="left" valign="top" class="s"><p><b style="color:#1C2022">555</b> Likes <b style="color:#1C2022">89</b> Retweets </p></td></tr><tr><td align="left" valign="top" class="s"><div align="center" style="text-align:center;margin-top:4px;margin-bottom:4px;padding:8px;border:1px solid #ccd6dd;border-radius:9999px;color:#1B95E0"><b>3 Replies</b></div></td></tr></table></a></td></tr></table></td></tr><tr><td class="dd" align="left" style="padding:0px 28px;text-align:left;word-break:break-word;"><p style="line-height:24px;"><!-- TODO(editorial): include a link to the new premium posts and subscription module --></p></td></tr></table></td></tr></table></td></tr><tr><td align="center" valign="top"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><tr><td class="b" align="center" valign="top" bgcolor="#2a2a2a" style="padding:0px 0px 0px 0px;border-style:solid;border-width: 0px 0px 0px 0px;border-color: #2a2a2a;border-bottom-left-radius:10px;border-bottom-right-radius:10px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top" bgcolor="#73ddff" style="padding:12px"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td><span style="padding-left:1px;"></span></td><td align="center" valign="middle" width="75" style="width:75px;"><a
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.1muhFWIqieRYpaJ-FbWSCQqcWoV4NNHHr5SkP9THApWuHAAlWLQxI3Q_IqFmt_DcyAxeC8jDApCnHmMSBGpBb5sgtimvBYgxRX-Rp7s0F3LjCHoSwdhr83OBqRFhJ1y_/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h20/h001.uuiO49Gyotcy6gb5xmuMg_wzkAxiC8Ijr29V3NAEqic" style="text-decoration:none;"><img width="22" height="22" alt="tw" border="0" style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/x_dark.png"/></a></td><td align="center" valign="middle" width="75" style="width:75px;"><a href="https://elink4f7.mail.bycloud.ai/ss/c/u001.amatuKKICSickUKplYJXmBoQnQ9VXnB2zTxBG4HeHBgjMqVxpoXRdj01cjwyoVlHgiebEOgBvwHtevoVpsSvpn3Q1di2ml6sb3cBM-X6IStQbj_zQSVGWJ8AAmPw2en2/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h21/h001.71TFS165AEF0SO3SekKC5SojMDXlqYB6AKbgUTb5dfs" style="text-decoration:none;"><img width="22" height="16" alt="yt" border="0" style="display:block;max-width:22px;color:Dark" src="https://media.beehiiv.com/cdn-cgi/image/fit=scale-down,format=auto,onerror=redirect,quality=80/static_assets/youtube_dark.png"/></a></td><td><span style="padding-left:1px;"></span></td></tr></table></td></tr><tr><td height="10" style="line-height:1px;font-size:1px;height:10px;"> </td></tr><tr><td class="w" align="center" valign="top" style="padding:15px 15px 15px 15px;"><table role="none" width="100%" border="0" cellspacing="0" cellpadding="0" align="center"><tr><td align="center" valign="top"><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> Update your email preferences or unsubscribe <a class="link" 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.c6q0w4g5sodbtO4I1B_pxWc4htTObwdorovK0nFHVH-4pUdVE0ELYH5DsNemk732SjNwhPNJ25r0O8B5vYifsBhEpz-DJgyVFmavJPa0OyKRRnvw4o7XGyvIv7PRofnmc2O_a5G9a6HaWcNhWKxTxwERAgpekoK8Z_sHzOAzGvALmYtenJNRKvILu4iDkEqG8RIyMNZp1C46js8c8oNPB6kOVYB3jdT0R52qj7iyUBmEQ0_f1L5p6mziIjQtNfcinJASrP8h64eNLyHNa_R8mV98LxV8U2po-tfE8-AVaPDUssXxymdk7zFRZrcek7UIjfgNyykarLStaCZsvEKsxT3q_v80Cp9Hu5PPsIfiUGw_KXfCP5wn8xSAr75Y5r0zcnsVMGgPwPbQe4M76yzyjid6ihUGQVUezUeYez4GNrKgVHdB6qTMobBCbJDLlCHkBYCI9dmMOHdTYwbrkf8-DUKM02BX_T9X_bDiuhOEgj6DXwPWXOrYosMA3R97W1QzDpJJ_XozxHEx_oewXxtUCX8I1J32ty5sPERn4YX92g0duKTNOfkWPwkp6PcGP9_9zIC2D89b04FPB-IGE9PiCS2-JqfhQ7lasLchP9ZJh9PFlXunrQlrRqU3S5vrzY313aZETt5K8ChK3Jw6VJwC4MgjW7-USKTmKyuvGr7zVzrYboon5Bl1toR3b1CvBtHjSx6jerImNzVwe87BHeA4PFdeiHDIM89EquSyjFdUoYqU9mUeshjU9d-ae9icZ6iGEJ5iOE2ZMb4-jXjQ3nVUvRxoPopETAvoVEEPsB_XkJzIKQah6RrGLrrAwy1bE77EE4ZDZi41C49_nAynU9ZDlA/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h22/h001.FB0G-GpV2ki7dYlKqMk2zuojt7AJy4JfxZZ4rfp8chE" style="text-decoration:underline;text-decoration-color:#FFFFFF!important;color:#FFFFFF!important;"> here</a></p><p class="copyright" style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> © 2025 bycloudai </p><p style="font-family:'Verdana',Geneva,sans-serif;color:#FFFFFF!important;"> 228 Park Ave S, #29976, New York, New York 10003, United States </p></td></tr><tr style="display: table-row !important;"><td align="center" valign="top" style="padding-top:20px;" style="display:table-cell !important;"><table role="none" border="0" cellspacing="0" cellpadding="0" align="center" style="display:table !important;"><tr style="display:table-row !important;"><td class="u" align="center" valign="middle" height="32" style="height:32px;display:table-cell !important; max-height: 32px !important;margin:0px !important; background-color: #ffffff !important;"><a style="line-height:32px !important;text-decoration:none;display:block !important;" 
href="https://elink4f7.mail.bycloud.ai/ss/c/u001.DUiN96-Eq7pUHzwEhy5j28olDWFpV5DDKfdk_OdOKOgzOy5dup7tIBxDiwzoiSHg9yAYU1lTkiWEQGauSYwfRwfKi9fJZBqPIg4zTL70G7jrdzIK5-kpcJgvVh_Sisv-lnhy27i0eR7ukLw-4Fvz946AycgCIiB13mhWrOdpPu7mLczFEgTSlWuM-pQ3DT-y_wsXZGQIept4zDYvK2BM8HUtnf2eF5IgtKhsob3kCVbQL33T8E9xvXfheRIkkgae/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h23/h001.alePRc5U0e_ylSmkCxwYGVAkx3_JOXE_9xgmGVyHDLg"><img src="https://media.beehiiv.com/output-onlinepngtools.png" width="16" alt="beehiiv logo" style="display:inline-block !important;max-width:16px !important; vertical-align:-3px !important;width: 16px !important;" border="0"/><span style="padding-left:11px !important;display: inline-block !important;">Powered by beehiiv</span></a></td></tr></table></td></tr><tr><td align="left" valign="top" height="2" style="height:2px;"><a href='https://elink4f7.mail.bycloud.ai/ss/c/u001.CxDkkVpJsBdVoe83c_tBWsHIaP4XNp0WgUYqLvHcKk_3uqk_KIkz4ddLinhFbud6JuxLFdSUhYnR7b1NSsmbtzXNGNblnEEMKUtkCAjkn8Y/4g2/dNb_B7-3TOqMXUk2Pq45KQ/h24/h001.f1Mt1ySet3SUfTFefhG3E2ac2IsIwxCRa2vtiQGPWFE' style="color: #2a2a2a !important; cursor: default; font-size: 1px; text-decoration: none;"> Terms of Service </a></td></tr></table></td></tr></table></td></tr></td></tr></table></td></tr></table></td></tr></table></td></tr></table></div></body></html>