{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:KQ5RITZ4EURM7LHALWYVZHLXXA","short_pith_number":"pith:KQ5RITZ4","schema_version":"1.0","canonical_sha256":"543b144f3c2522cface05db15c9d77b82e18635b4efd21f118dd0e05afdded29","source":{"kind":"arxiv","id":"2606.12209","version":1},"attestation_state":"computed","paper":{"title":"Interpretable enzyme function prediction via sparse autoencoder features of ESMC across the microbial protein universe","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"q-bio.QM","authors_text":"Junqing Wang, Wanyu Cheng, Yingchao Liu, Yue Hu","submitted_at":"2026-06-10T15:27:40Z","abstract_excerpt":"Microbial genomes and metagenomes contain millions of proteins whose enzymatic functions remain unknown, the enzyme dark matter. While deep learning has improved protein function prediction, most methods are black boxes relying on sequence or structural similarity, limiting discovery of novel catalytic activities. The ESMC-6B protein language model and its sparse autoencoder with a 16,384-dimensional codebook of interpretable biological concepts, each annotated by GPT-5, creates a new opportunity: using these features directly as semantic signatures for enzyme function. Here, we show that ESMC"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.12209","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"q-bio.QM","submitted_at":"2026-06-10T15:27:40Z","cross_cats_sorted":[],"title_canon_sha256":"be449adb9bddd7383d0ef87868fe5c5474d6274b3345696163cf7b44ae6a7f67","abstract_canon_sha256":"e0f7e485faac37beb765bd7883fcb62f540ab8f933e0887be69142b27e728b14"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-11T01:10:54.470984Z","signature_b64":"RLpyZSntPzsoCM+AbOoz4KDIgbKJzTOTu1fKRFYiriGEMRv+ADQcRLwyatOMpECto0bavoPwtBonedMF8tuaAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"543b144f3c2522cface05db15c9d77b82e18635b4efd21f118dd0e05afdded29","last_reissued_at":"2026-06-11T01:10:54.470027Z","signature_status":"signed_v1","first_computed_at":"2026-06-11T01:10:54.470027Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Interpretable enzyme function prediction via sparse autoencoder features of ESMC across the microbial protein universe","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"q-bio.QM","authors_text":"Junqing Wang, Wanyu Cheng, Yingchao Liu, Yue Hu","submitted_at":"2026-06-10T15:27:40Z","abstract_excerpt":"Microbial genomes and metagenomes contain millions of proteins whose enzymatic functions remain unknown, the enzyme dark matter. While deep learning has improved protein function prediction, most methods are black boxes relying on sequence or structural similarity, limiting discovery of novel catalytic activities. The ESMC-6B protein language model and its sparse autoencoder with a 16,384-dimensional codebook of interpretable biological concepts, each annotated by GPT-5, creates a new opportunity: using these features directly as semantic signatures for enzyme function. Here, we show that ESMC"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.12209","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.12209/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.12209","created_at":"2026-06-11T01:10:54.470161+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.12209v1","created_at":"2026-06-11T01:10:54.470161+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.12209","created_at":"2026-06-11T01:10:54.470161+00:00"},{"alias_kind":"pith_short_12","alias_value":"KQ5RITZ4EURM","created_at":"2026-06-11T01:10:54.470161+00:00"},{"alias_kind":"pith_short_16","alias_value":"KQ5RITZ4EURM7LHA","created_at":"2026-06-11T01:10:54.470161+00:00"},{"alias_kind":"pith_short_8","alias_value":"KQ5RITZ4","created_at":"2026-06-11T01:10:54.470161+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA","json":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA.json","graph_json":"https://pith.science/api/pith-number/KQ5RITZ4EURM7LHALWYVZHLXXA/graph.json","events_json":"https://pith.science/api/pith-number/KQ5RITZ4EURM7LHALWYVZHLXXA/events.json","paper":"https://pith.science/paper/KQ5RITZ4"},"agent_actions":{"view_html":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA","download_json":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA.json","view_paper":"https://pith.science/paper/KQ5RITZ4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.12209&json=true","fetch_graph":"https://pith.science/api/pith-number/KQ5RITZ4EURM7LHALWYVZHLXXA/graph.json","fetch_events":"https://pith.science/api/pith-number/KQ5RITZ4EURM7LHALWYVZHLXXA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA/action/storage_attestation","attest_author":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA/action/author_attestation","sign_citation":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA/action/citation_signature","submit_replication":"https://pith.science/pith/KQ5RITZ4EURM7LHALWYVZHLXXA/action/replication_record"}},"created_at":"2026-06-11T01:10:54.470161+00:00","updated_at":"2026-06-11T01:10:54.470161+00:00"}