{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:JKFNN7TLO6AEIXZSJ4AW7MFCLK","short_pith_number":"pith:JKFNN7TL","schema_version":"1.0","canonical_sha256":"4a8ad6fe6b7780445f324f016fb0a25a982dd02e8a5b86a55f28dafa4c1f9be6","source":{"kind":"arxiv","id":"2301.04558","version":2},"attestation_state":"computed","paper":{"title":"Learning to Exploit Temporal Structure for Biomedical Vision-Language Processing","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Aditya Nori, Anja Thieme, Anton Schwaighofer, Benedikt Boecking, Daniel C. Castro, Fernando P\\'erez-Garc\\'ia, Harshita Sharma, Javier Alvarez-Valle, Kenza Bouzid, Maria Wetscherek, Matthew P. Lungren, Maximilian Ilse, Ozan Oktay, Qianchu Liu, Shruthi Bannur, Stephanie Hyland","submitted_at":"2023-01-11T16:35:33Z","abstract_excerpt":"Self-supervised learning in vision-language processing exploits semantic alignment between imaging and text modalities. Prior work in biomedical VLP has mostly relied on the alignment of single image and report pairs even though clinical notes commonly refer to prior images. This does not only introduce poor alignment between the modalities but also a missed opportunity to exploit rich self-supervision through existing temporal content in the data. In this work, we explicitly account for prior images and reports when available during both training and fine-tuning. Our approach, named BioViL-T,"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2301.04558","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-01-11T16:35:33Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3789f9fbc1475030f2763797d2aa722717cd430116948e23bdc1cf41299927ae","abstract_canon_sha256":"e8d8be7f8bd5767da082959d975eda57f5e802ffed92a2420d22c2c895b1a938"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T07:47:55.060183Z","signature_b64":"0mPsqcU3AgM9s1zr8s0CSPgxHop2RQoWmGs9OEWyXvA8XkyYyIrrxsWc78jrQHWRbeYYTeBHEqpMbWER3pMiAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4a8ad6fe6b7780445f324f016fb0a25a982dd02e8a5b86a55f28dafa4c1f9be6","last_reissued_at":"2026-07-05T07:47:55.059706Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T07:47:55.059706Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Learning to Exploit Temporal Structure for Biomedical Vision-Language Processing","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Aditya Nori, Anja Thieme, Anton Schwaighofer, Benedikt Boecking, Daniel C. Castro, Fernando P\\'erez-Garc\\'ia, Harshita Sharma, Javier Alvarez-Valle, Kenza Bouzid, Maria Wetscherek, Matthew P. Lungren, Maximilian Ilse, Ozan Oktay, Qianchu Liu, Shruthi Bannur, Stephanie Hyland","submitted_at":"2023-01-11T16:35:33Z","abstract_excerpt":"Self-supervised learning in vision-language processing exploits semantic alignment between imaging and text modalities. Prior work in biomedical VLP has mostly relied on the alignment of single image and report pairs even though clinical notes commonly refer to prior images. This does not only introduce poor alignment between the modalities but also a missed opportunity to exploit rich self-supervision through existing temporal content in the data. In this work, we explicitly account for prior images and reports when available during both training and fine-tuning. Our approach, named BioViL-T,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2301.04558","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2301.04558/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2301.04558","created_at":"2026-07-05T07:47:55.059767+00:00"},{"alias_kind":"arxiv_version","alias_value":"2301.04558v2","created_at":"2026-07-05T07:47:55.059767+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2301.04558","created_at":"2026-07-05T07:47:55.059767+00:00"},{"alias_kind":"pith_short_12","alias_value":"JKFNN7TLO6AE","created_at":"2026-07-05T07:47:55.059767+00:00"},{"alias_kind":"pith_short_16","alias_value":"JKFNN7TLO6AEIXZS","created_at":"2026-07-05T07:47:55.059767+00:00"},{"alias_kind":"pith_short_8","alias_value":"JKFNN7TL","created_at":"2026-07-05T07:47:55.059767+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.29667","citing_title":"Unlocking the Visual Record of Materials Science: A Large-Scale Multimodal Dataset from Scientific Literature","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11304","citing_title":"CheXTemporal: A Dataset for Temporally-Grounded Reasoning in Chest Radiography","ref_index":12,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK","json":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK.json","graph_json":"https://pith.science/api/pith-number/JKFNN7TLO6AEIXZSJ4AW7MFCLK/graph.json","events_json":"https://pith.science/api/pith-number/JKFNN7TLO6AEIXZSJ4AW7MFCLK/events.json","paper":"https://pith.science/paper/JKFNN7TL"},"agent_actions":{"view_html":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK","download_json":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK.json","view_paper":"https://pith.science/paper/JKFNN7TL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2301.04558&json=true","fetch_graph":"https://pith.science/api/pith-number/JKFNN7TLO6AEIXZSJ4AW7MFCLK/graph.json","fetch_events":"https://pith.science/api/pith-number/JKFNN7TLO6AEIXZSJ4AW7MFCLK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK/action/storage_attestation","attest_author":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK/action/author_attestation","sign_citation":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK/action/citation_signature","submit_replication":"https://pith.science/pith/JKFNN7TLO6AEIXZSJ4AW7MFCLK/action/replication_record"}},"created_at":"2026-07-05T07:47:55.059767+00:00","updated_at":"2026-07-05T07:47:55.059767+00:00"}