{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XYKCYTCWV7ZWQ464H6KKTGGY4I","short_pith_number":"pith:XYKCYTCW","schema_version":"1.0","canonical_sha256":"be142c4c56aff36873dc3f94a998d8e21c356951dd9134dd5b3434322094790e","source":{"kind":"arxiv","id":"2605.18607","version":1},"attestation_state":"computed","paper":{"title":"Forecasting Downstream Performance of LLMs With Proxy Metrics","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Arkil Patel, Dzmitry Bahdanau, Marius Mosbach, Siva Reddy","submitted_at":"2026-05-18T16:17:15Z","abstract_excerpt":"Progress in language model development is often driven by comparative decisions: which architecture to adopt, which pretraining corpus to use, or which training recipe to apply. Making these decisions well requires reliable performance forecasts, yet the two commonly used signals are fundamentally limited. Cross-entropy loss is poorly aligned with downstream capabilities, and direct downstream evaluation is expensive, sparse, and often uninformative at early training stages. Instead, we propose to construct proxy metrics by aggregating token-level statistics, such as entropy, top-k accuracy, a"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.18607","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-18T16:17:15Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"9e42e53ba89737a3b6a63687b18e02f6cb21be5d6cc5e14b377ac05ed5195c78","abstract_canon_sha256":"1ee2220e1b835e32c44fe87fde6e49375e91105350c28a25d87dfddc6f000882"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:06:10.624548Z","signature_b64":"evdGQlLUgSVNjubhpCuofg/QAwL0U/jpVHiydjAk/Idl2crgpMnZbnVZRXh6FFKQHT02+MpYFniyAzVdlg9hDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"be142c4c56aff36873dc3f94a998d8e21c356951dd9134dd5b3434322094790e","last_reissued_at":"2026-05-20T00:06:10.623661Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:06:10.623661Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Forecasting Downstream Performance of LLMs With Proxy Metrics","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"Arkil Patel, Dzmitry Bahdanau, Marius Mosbach, Siva Reddy","submitted_at":"2026-05-18T16:17:15Z","abstract_excerpt":"Progress in language model development is often driven by comparative decisions: which architecture to adopt, which pretraining corpus to use, or which training recipe to apply. Making these decisions well requires reliable performance forecasts, yet the two commonly used signals are fundamentally limited. Cross-entropy loss is poorly aligned with downstream capabilities, and direct downstream evaluation is expensive, sparse, and often uninformative at early training stages. Instead, we propose to construct proxy metrics by aggregating token-level statistics, such as entropy, top-k accuracy, a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.18607","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.18607/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T00:01:59.243642Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"5145f996d8a7088c1abe8dfd88b63c61268775d3c3abd99dce9392b687660da9"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.18607","created_at":"2026-05-20T00:06:10.623799+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.18607v1","created_at":"2026-05-20T00:06:10.623799+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.18607","created_at":"2026-05-20T00:06:10.623799+00:00"},{"alias_kind":"pith_short_12","alias_value":"XYKCYTCWV7ZW","created_at":"2026-05-20T00:06:10.623799+00:00"},{"alias_kind":"pith_short_16","alias_value":"XYKCYTCWV7ZWQ464","created_at":"2026-05-20T00:06:10.623799+00:00"},{"alias_kind":"pith_short_8","alias_value":"XYKCYTCW","created_at":"2026-05-20T00:06:10.623799+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I","json":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I.json","graph_json":"https://pith.science/api/pith-number/XYKCYTCWV7ZWQ464H6KKTGGY4I/graph.json","events_json":"https://pith.science/api/pith-number/XYKCYTCWV7ZWQ464H6KKTGGY4I/events.json","paper":"https://pith.science/paper/XYKCYTCW"},"agent_actions":{"view_html":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I","download_json":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I.json","view_paper":"https://pith.science/paper/XYKCYTCW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.18607&json=true","fetch_graph":"https://pith.science/api/pith-number/XYKCYTCWV7ZWQ464H6KKTGGY4I/graph.json","fetch_events":"https://pith.science/api/pith-number/XYKCYTCWV7ZWQ464H6KKTGGY4I/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I/action/storage_attestation","attest_author":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I/action/author_attestation","sign_citation":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I/action/citation_signature","submit_replication":"https://pith.science/pith/XYKCYTCWV7ZWQ464H6KKTGGY4I/action/replication_record"}},"created_at":"2026-05-20T00:06:10.623799+00:00","updated_at":"2026-05-20T00:06:10.623799+00:00"}