{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:UDUU2MOVI6KVDDTYFJTQDAJHDU","short_pith_number":"pith:UDUU2MOV","schema_version":"1.0","canonical_sha256":"a0e94d31d54795518e782a670181271d2d9f6da5b6d1a9bff976e84e5b7c29df","source":{"kind":"arxiv","id":"2510.26384","version":2},"attestation_state":"computed","paper":{"title":"Scales++: Compute Efficient Evaluation Subset Selection with Cognitive Scales Embeddings","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Andrew M. Bean, Jonathan Richard Schwarz, Nabeel Seedat, Shengzhuang Chen","submitted_at":"2025-10-30T11:28:58Z","abstract_excerpt":"The prohibitive cost of evaluating large language models (LLMs) on comprehensive benchmarks necessitates the creation of small yet representative data subsets (i.e., tiny benchmarks) that enable efficient assessment while retaining predictive fidelity. Current methods for this task operate under a model-centric paradigm, selecting benchmarking items based on the collective performance of existing models. Such approaches are limited by large upfront costs, an inability to immediately handle new benchmarks (\"cold-start\"), and the fragile assumption that future models will share the failure patte"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.26384","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-10-30T11:28:58Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"2ab7fc16618b56181159d2cdd1b801059fa6fe67c7bc0d9f4dd5e658f389a9f6","abstract_canon_sha256":"29d75e57a332bdea892d43d4a7fdf4a0d53361a051416b349a6f9b2c61b530aa"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:04:16.491373Z","signature_b64":"MI+vC+Lf043EmauC6rwE1fGQihP1bQTTZd3I8BI44Xbu4OzR+Ys8FA9FPkZsMtr829WYeiBL+TnaDIvjWqE4CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a0e94d31d54795518e782a670181271d2d9f6da5b6d1a9bff976e84e5b7c29df","last_reissued_at":"2026-05-20T00:04:16.490647Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:04:16.490647Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Scales++: Compute Efficient Evaluation Subset Selection with Cognitive Scales Embeddings","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Andrew M. Bean, Jonathan Richard Schwarz, Nabeel Seedat, Shengzhuang Chen","submitted_at":"2025-10-30T11:28:58Z","abstract_excerpt":"The prohibitive cost of evaluating large language models (LLMs) on comprehensive benchmarks necessitates the creation of small yet representative data subsets (i.e., tiny benchmarks) that enable efficient assessment while retaining predictive fidelity. Current methods for this task operate under a model-centric paradigm, selecting benchmarking items based on the collective performance of existing models. Such approaches are limited by large upfront costs, an inability to immediately handle new benchmarks (\"cold-start\"), and the fragile assumption that future models will share the failure patte"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.26384","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.26384/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.26384","created_at":"2026-05-20T00:04:16.490751+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.26384v2","created_at":"2026-05-20T00:04:16.490751+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.26384","created_at":"2026-05-20T00:04:16.490751+00:00"},{"alias_kind":"pith_short_12","alias_value":"UDUU2MOVI6KV","created_at":"2026-05-20T00:04:16.490751+00:00"},{"alias_kind":"pith_short_16","alias_value":"UDUU2MOVI6KVDDTY","created_at":"2026-05-20T00:04:16.490751+00:00"},{"alias_kind":"pith_short_8","alias_value":"UDUU2MOV","created_at":"2026-05-20T00:04:16.490751+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2603.06610","citing_title":"CapTrack: Multifaceted Evaluation of Forgetting in LLM Post-Training","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2603.06610","citing_title":"CapTrack: Multifaceted Evaluation of Forgetting in LLM Post-Training","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07096","citing_title":"Query-efficient model evaluation using cached responses","ref_index":148,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU","json":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU.json","graph_json":"https://pith.science/api/pith-number/UDUU2MOVI6KVDDTYFJTQDAJHDU/graph.json","events_json":"https://pith.science/api/pith-number/UDUU2MOVI6KVDDTYFJTQDAJHDU/events.json","paper":"https://pith.science/paper/UDUU2MOV"},"agent_actions":{"view_html":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU","download_json":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU.json","view_paper":"https://pith.science/paper/UDUU2MOV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.26384&json=true","fetch_graph":"https://pith.science/api/pith-number/UDUU2MOVI6KVDDTYFJTQDAJHDU/graph.json","fetch_events":"https://pith.science/api/pith-number/UDUU2MOVI6KVDDTYFJTQDAJHDU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU/action/storage_attestation","attest_author":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU/action/author_attestation","sign_citation":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU/action/citation_signature","submit_replication":"https://pith.science/pith/UDUU2MOVI6KVDDTYFJTQDAJHDU/action/replication_record"}},"created_at":"2026-05-20T00:04:16.490751+00:00","updated_at":"2026-05-20T00:04:16.490751+00:00"}