{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:YKQZDIQ3HBRUY4IARCBELST6B3","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fe146792818b3478d278f05cbcc09f4bec68d135d66cbb89666e7b1da47364b3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T05:45:54Z","title_canon_sha256":"75dfe0046e564b7eb28ab56df1838b9d494fe63fdbc1e7112fa72ddff8666343"},"schema_version":"1.0","source":{"id":"2605.16823","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.16823","created_at":"2026-05-20T00:03:24Z"},{"alias_kind":"arxiv_version","alias_value":"2605.16823v1","created_at":"2026-05-20T00:03:24Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16823","created_at":"2026-05-20T00:03:24Z"},{"alias_kind":"pith_short_12","alias_value":"YKQZDIQ3HBRU","created_at":"2026-05-20T00:03:24Z"},{"alias_kind":"pith_short_16","alias_value":"YKQZDIQ3HBRUY4IA","created_at":"2026-05-20T00:03:24Z"},{"alias_kind":"pith_short_8","alias_value":"YKQZDIQ3","created_at":"2026-05-20T00:03:24Z"}],"graph_snapshots":[{"event_id":"sha256:33956393850c724287dd885df51f12ba759d64efeba0b7b29cb3a1aea06c0df0","target":"graph","created_at":"2026-05-20T00:03:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results show that VQ-Atom consistently improves predictive performance compared to conventional tokenization approaches in protein-ligand interaction prediction under a protein-cold split setting without relying on 3D structural information."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the codebook entries learned via vector quantization on GNN embeddings correspond to chemically meaningful atomic contexts that are relevant to the downstream prediction task and generalize beyond the training distribution."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VQ-Atom discretizes continuous GNN atom embeddings into chemically meaningful discrete tokens via vector quantization to improve molecular language modeling for downstream chemistry tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Vector quantization on atom embeddings yields discrete tokens for chemical contexts that boost protein-ligand prediction."}],"snapshot_sha256":"5cf07f9c28a02b285109ab2e258cea4f3f01a891f9e8045d53335515bdc58e6f"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"aee59022026b7f1dd64a1c4d0606c8a2b78fe458d93fd34d6e23fdbf678b9ba2"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T21:01:26.461512Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T21:01:19.266647Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T19:01:56.266042Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.408143Z","status":"skipped","version":"1.0.0"}],"endpoint":"/pith/2605.16823/integrity.json","findings":[],"snapshot_sha256":"7e0454c061d036915a39117c1ce0fbf0c4e0bc4bfa78649d085fa84b07a660c2","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Molecular representation learning has become a central approach in AI-driven drug discovery, yet existing molecular tokenizations such as SMILES remain largely syntactic and do not naturally align with chemically meaningful substructures. In this work, we introduce VQ-Atom, a semantic discretization framework that converts continuous atom-level graph representations into discrete tokens corresponding to local chemical environments. Using graph neural network embeddings and vector quantization, atoms are assigned to codebook entries representing chemically meaningful atomic contexts. These disc","authors_text":"Takayuki Kimura","cross_cats":[],"headline":"Vector quantization on atom embeddings yields discrete tokens for chemical contexts that boost protein-ligand prediction.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T05:45:54Z","title":"Atoms as Language: VQ-Atom: Semantic Discretization for Molecular Representation Learning"},"references":{"count":15,"internal_anchors":0,"resolved_work":15,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Bert: Pre-training of deep bidirectional transformers for language understanding","work_id":"c9277a7f-e03f-4299-a26f-8f1f0af57ff8","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Language models are few-shot learners.NeurIPS, 2020","work_id":"b792da6d-d41a-40bc-84d7-40b3b9f0d365","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Neural machine translation of rare words with subword units","work_id":"52274568-20ab-422d-b768-2f7fd6f52dbe","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Smiles, a chemical language and information system","work_id":"8224d9f6-ca3b-4583-abc0-87dd34758f1f","year":1988},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Shortcut learning in deep neural networks.Nature Machine Intelligence, 2020","work_id":"1a37e89e-1c36-4193-a69a-846039769be1","year":2020}],"snapshot_sha256":"97813c397508c299a66025647247792983884be29bbff28b847ea682285aa766"},"source":{"id":"2605.16823","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T20:55:05.744812Z","id":"3e30f4e1-b31f-46df-9ffc-57a33cd9fdd1","model_set":{"reader":"grok-4.3"},"one_line_summary":"VQ-Atom discretizes continuous GNN atom embeddings into chemically meaningful discrete tokens via vector quantization to improve molecular language modeling for downstream chemistry tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Vector quantization on atom embeddings yields discrete tokens for chemical contexts that boost protein-ligand prediction.","strongest_claim":"Experimental results show that VQ-Atom consistently improves predictive performance compared to conventional tokenization approaches in protein-ligand interaction prediction under a protein-cold split setting without relying on 3D structural information.","weakest_assumption":"That the codebook entries learned via vector quantization on GNN embeddings correspond to chemically meaningful atomic contexts that are relevant to the downstream prediction task and generalize beyond the training distribution."}},"verdict_id":"3e30f4e1-b31f-46df-9ffc-57a33cd9fdd1"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:803b2f31636e61678f2e1e11f0a1364263a42b6964cd372c0c91af5febdbed59","target":"record","created_at":"2026-05-20T00:03:24Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fe146792818b3478d278f05cbcc09f4bec68d135d66cbb89666e7b1da47364b3","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-16T05:45:54Z","title_canon_sha256":"75dfe0046e564b7eb28ab56df1838b9d494fe63fdbc1e7112fa72ddff8666343"},"schema_version":"1.0","source":{"id":"2605.16823","kind":"arxiv","version":1}},"canonical_sha256":"c2a191a21b38634c7100888245ca7e0ed1ca381a5d26119b6188b72df2a4ab11","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c2a191a21b38634c7100888245ca7e0ed1ca381a5d26119b6188b72df2a4ab11","first_computed_at":"2026-05-20T00:03:24.471740Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:03:24.471740Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"oK3QVNwshrRvKNc0GCxeEZjC+CB/fc41zkf+vcfOYbIR7ZzmUnE4e7FB1qAir4at3xQyl6pvn0R5gHZf6i5VCw==","signature_status":"signed_v1","signed_at":"2026-05-20T00:03:24.472591Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.16823","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:803b2f31636e61678f2e1e11f0a1364263a42b6964cd372c0c91af5febdbed59","sha256:33956393850c724287dd885df51f12ba759d64efeba0b7b29cb3a1aea06c0df0"],"state_sha256":"e1209f75660f4d4ba16ca534d520e37845e2e132b29c8705b8567e1d712416f8"}