{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:MTO732OKVJ32L75FQIEZXKWQIL","short_pith_number":"pith:MTO732OK","schema_version":"1.0","canonical_sha256":"64ddfde9caaa77a5ffa582099baad042fc0803e8579fb549c3fd660e8ff9c901","source":{"kind":"arxiv","id":"2308.09351","version":1},"attestation_state":"computed","paper":{"title":"RLIPv2: Fast Scaling of Relational Language-Image Pre-training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.MM"],"primary_cat":"cs.CV","authors_text":"Deli Zhao, Dong Ni, Hangjie Yuan, Jianwen Jiang, Samuel Albanie, Shiwei Zhang, Tao Feng, Xiang Wang, Yingya Zhang, Yining Pan","submitted_at":"2023-08-18T07:17:09Z","abstract_excerpt":"Relational Language-Image Pre-training (RLIP) aims to align vision representations with relational texts, thereby advancing the capability of relational reasoning in computer vision tasks. However, hindered by the slow convergence of RLIPv1 architecture and the limited availability of existing scene graph data, scaling RLIPv1 is challenging. In this paper, we propose RLIPv2, a fast converging model that enables the scaling of relational pre-training to large-scale pseudo-labelled scene graph data. To enable fast scaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism th"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2308.09351","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2023-08-18T07:17:09Z","cross_cats_sorted":["cs.AI","cs.LG","cs.MM"],"title_canon_sha256":"e1e7818069a286354947192fd46a7aa296203a81897e948b219c890eb4ecf68d","abstract_canon_sha256":"14239cadde485168ba87bddf1107eba2f21262682ee41f5f522c0d2f84ee215f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T06:42:31.290450Z","signature_b64":"odeRtvT2kQC5A5YW1p/sb7qW1Ngrqfhmzc6JV6HHe+0iCVYz7v5CkdKWfjrQuFsVfCfDJSbtlg11ZNjIsH1mAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"64ddfde9caaa77a5ffa582099baad042fc0803e8579fb549c3fd660e8ff9c901","last_reissued_at":"2026-07-05T06:42:31.290066Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T06:42:31.290066Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"RLIPv2: Fast Scaling of Relational Language-Image Pre-training","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.MM"],"primary_cat":"cs.CV","authors_text":"Deli Zhao, Dong Ni, Hangjie Yuan, Jianwen Jiang, Samuel Albanie, Shiwei Zhang, Tao Feng, Xiang Wang, Yingya Zhang, Yining Pan","submitted_at":"2023-08-18T07:17:09Z","abstract_excerpt":"Relational Language-Image Pre-training (RLIP) aims to align vision representations with relational texts, thereby advancing the capability of relational reasoning in computer vision tasks. However, hindered by the slow convergence of RLIPv1 architecture and the limited availability of existing scene graph data, scaling RLIPv1 is challenging. In this paper, we propose RLIPv2, a fast converging model that enables the scaling of relational pre-training to large-scale pseudo-labelled scene graph data. To enable fast scaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism th"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2308.09351","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2308.09351/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2308.09351","created_at":"2026-07-05T06:42:31.290122+00:00"},{"alias_kind":"arxiv_version","alias_value":"2308.09351v1","created_at":"2026-07-05T06:42:31.290122+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2308.09351","created_at":"2026-07-05T06:42:31.290122+00:00"},{"alias_kind":"pith_short_12","alias_value":"MTO732OKVJ32","created_at":"2026-07-05T06:42:31.290122+00:00"},{"alias_kind":"pith_short_16","alias_value":"MTO732OKVJ32L75F","created_at":"2026-07-05T06:42:31.290122+00:00"},{"alias_kind":"pith_short_8","alias_value":"MTO732OK","created_at":"2026-07-05T06:42:31.290122+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL","json":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL.json","graph_json":"https://pith.science/api/pith-number/MTO732OKVJ32L75FQIEZXKWQIL/graph.json","events_json":"https://pith.science/api/pith-number/MTO732OKVJ32L75FQIEZXKWQIL/events.json","paper":"https://pith.science/paper/MTO732OK"},"agent_actions":{"view_html":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL","download_json":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL.json","view_paper":"https://pith.science/paper/MTO732OK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2308.09351&json=true","fetch_graph":"https://pith.science/api/pith-number/MTO732OKVJ32L75FQIEZXKWQIL/graph.json","fetch_events":"https://pith.science/api/pith-number/MTO732OKVJ32L75FQIEZXKWQIL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL/action/storage_attestation","attest_author":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL/action/author_attestation","sign_citation":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL/action/citation_signature","submit_replication":"https://pith.science/pith/MTO732OKVJ32L75FQIEZXKWQIL/action/replication_record"}},"created_at":"2026-07-05T06:42:31.290122+00:00","updated_at":"2026-07-05T06:42:31.290122+00:00"}