{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PXTAGQFVTJREVOXA76QIXZFL7D","short_pith_number":"pith:PXTAGQFV","schema_version":"1.0","canonical_sha256":"7de60340b59a624abae0ffa08be4abf8db8eb249d44960916a6a2fc4e5a542ac","source":{"kind":"arxiv","id":"2605.29384","version":1},"attestation_state":"computed","paper":{"title":"Latent Terms: Dense Retrievers Contain Trivially Extractable BM25-ready Zipfian Vocabularies","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.IR","authors_text":"Aamir Shakir, Benjamin Clavi\\'e, Makoto P. Kato, Sean Lee","submitted_at":"2026-05-28T05:36:37Z","abstract_excerpt":"We propose Latent Terms, a method revealing that models trained for dense retrieval, whether single- or multi-vector, learn representations that can trivially be decomposed into retrieval-ready sparse features. When trained on frozen retrievers, Sparse Autoencoders without any retrieval-specific adjustments extract a latent vocabulary with approximately Zipfian collection statistics, directly suitable for classical sparse retrieval scoring via BM25. This approach enables sparse retrieval while requiring no learned expansion objective or sparse retrieval supervision whatsoever, and can be readi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.29384","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.IR","submitted_at":"2026-05-28T05:36:37Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"8f826f9fab4c347129f2e468e28851290ac6bc8dc4a3e415efa461b78ee96269","abstract_canon_sha256":"2ba214ccbf20d93462e073576dcb62a8655510ab9e09a36180f1d6e57d48ccb0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:05:36.346729Z","signature_b64":"tC1nM5ForypdOlQRFL0VhgtWJazUiIY+V3k9cwU96X5o3dw+eZUG/Trqtp77oeOPv4eA7BidQgbOSaLXbS5vAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7de60340b59a624abae0ffa08be4abf8db8eb249d44960916a6a2fc4e5a542ac","last_reissued_at":"2026-05-29T01:05:36.346209Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:05:36.346209Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Latent Terms: Dense Retrievers Contain Trivially Extractable BM25-ready Zipfian Vocabularies","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.IR","authors_text":"Aamir Shakir, Benjamin Clavi\\'e, Makoto P. Kato, Sean Lee","submitted_at":"2026-05-28T05:36:37Z","abstract_excerpt":"We propose Latent Terms, a method revealing that models trained for dense retrieval, whether single- or multi-vector, learn representations that can trivially be decomposed into retrieval-ready sparse features. When trained on frozen retrievers, Sparse Autoencoders without any retrieval-specific adjustments extract a latent vocabulary with approximately Zipfian collection statistics, directly suitable for classical sparse retrieval scoring via BM25. This approach enables sparse retrieval while requiring no learned expansion objective or sparse retrieval supervision whatsoever, and can be readi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.29384","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.29384/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.29384","created_at":"2026-05-29T01:05:36.346291+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.29384v1","created_at":"2026-05-29T01:05:36.346291+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.29384","created_at":"2026-05-29T01:05:36.346291+00:00"},{"alias_kind":"pith_short_12","alias_value":"PXTAGQFVTJRE","created_at":"2026-05-29T01:05:36.346291+00:00"},{"alias_kind":"pith_short_16","alias_value":"PXTAGQFVTJREVOXA","created_at":"2026-05-29T01:05:36.346291+00:00"},{"alias_kind":"pith_short_8","alias_value":"PXTAGQFV","created_at":"2026-05-29T01:05:36.346291+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D","json":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D.json","graph_json":"https://pith.science/api/pith-number/PXTAGQFVTJREVOXA76QIXZFL7D/graph.json","events_json":"https://pith.science/api/pith-number/PXTAGQFVTJREVOXA76QIXZFL7D/events.json","paper":"https://pith.science/paper/PXTAGQFV"},"agent_actions":{"view_html":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D","download_json":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D.json","view_paper":"https://pith.science/paper/PXTAGQFV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.29384&json=true","fetch_graph":"https://pith.science/api/pith-number/PXTAGQFVTJREVOXA76QIXZFL7D/graph.json","fetch_events":"https://pith.science/api/pith-number/PXTAGQFVTJREVOXA76QIXZFL7D/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D/action/storage_attestation","attest_author":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D/action/author_attestation","sign_citation":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D/action/citation_signature","submit_replication":"https://pith.science/pith/PXTAGQFVTJREVOXA76QIXZFL7D/action/replication_record"}},"created_at":"2026-05-29T01:05:36.346291+00:00","updated_at":"2026-05-29T01:05:36.346291+00:00"}