{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:577PK3UQPTJWWV6ZJAPVBTZPZ3","short_pith_number":"pith:577PK3UQ","schema_version":"1.0","canonical_sha256":"effef56e907cd36b57d9481f50cf2fcefea16b87fdce805a136d6c7cfbf4de50","source":{"kind":"arxiv","id":"2605.27920","version":1},"attestation_state":"computed","paper":{"title":"Rethinking Video-Language Model from the Language Input Perspective","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Changshuo Wang, Daizong Liu, Wanlong Fang, Xiang Fang, Xiaoye Qu","submitted_at":"2026-05-27T03:47:05Z","abstract_excerpt":"Driven by the wave of large language models, Video-Language Models (VLMs) have become a significant yet challenging technology to bridge the gap between videos and texts. Although previous VLM works have made significant progress, almost all of them implicitly assume that all the texts are predefined by the specific template. In real-world applications, such a strict assumption is impossible to satisfy since 1) predefining all the texts is extremely time-consuming and labor-intensive. 2) these predefined text inputs are too restrictive and user-unfriendly, limiting their applications. It is ob"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.27920","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-27T03:47:05Z","cross_cats_sorted":[],"title_canon_sha256":"b646c6b5aa374e8fff21fe45f3b67a5f19b082b4a96f2571f41503af1f351854","abstract_canon_sha256":"602655a4f41e710029c01a5efeb4eb39335b2ef5e81530b1f4705379600e5025"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:04:52.554398Z","signature_b64":"YNylTs2pv4kwO1NJT1hWmBbndw1iPUcXT22L24CrKt02+U85vxW8iAuHjPISIqMIKh25bp86w9X+QQbvo1e9Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"effef56e907cd36b57d9481f50cf2fcefea16b87fdce805a136d6c7cfbf4de50","last_reissued_at":"2026-05-28T01:04:52.553942Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:04:52.553942Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Rethinking Video-Language Model from the Language Input Perspective","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Changshuo Wang, Daizong Liu, Wanlong Fang, Xiang Fang, Xiaoye Qu","submitted_at":"2026-05-27T03:47:05Z","abstract_excerpt":"Driven by the wave of large language models, Video-Language Models (VLMs) have become a significant yet challenging technology to bridge the gap between videos and texts. Although previous VLM works have made significant progress, almost all of them implicitly assume that all the texts are predefined by the specific template. In real-world applications, such a strict assumption is impossible to satisfy since 1) predefining all the texts is extremely time-consuming and labor-intensive. 2) these predefined text inputs are too restrictive and user-unfriendly, limiting their applications. It is ob"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.27920","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.27920/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.27920","created_at":"2026-05-28T01:04:52.554005+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.27920v1","created_at":"2026-05-28T01:04:52.554005+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.27920","created_at":"2026-05-28T01:04:52.554005+00:00"},{"alias_kind":"pith_short_12","alias_value":"577PK3UQPTJW","created_at":"2026-05-28T01:04:52.554005+00:00"},{"alias_kind":"pith_short_16","alias_value":"577PK3UQPTJWWV6Z","created_at":"2026-05-28T01:04:52.554005+00:00"},{"alias_kind":"pith_short_8","alias_value":"577PK3UQ","created_at":"2026-05-28T01:04:52.554005+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3","json":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3.json","graph_json":"https://pith.science/api/pith-number/577PK3UQPTJWWV6ZJAPVBTZPZ3/graph.json","events_json":"https://pith.science/api/pith-number/577PK3UQPTJWWV6ZJAPVBTZPZ3/events.json","paper":"https://pith.science/paper/577PK3UQ"},"agent_actions":{"view_html":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3","download_json":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3.json","view_paper":"https://pith.science/paper/577PK3UQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.27920&json=true","fetch_graph":"https://pith.science/api/pith-number/577PK3UQPTJWWV6ZJAPVBTZPZ3/graph.json","fetch_events":"https://pith.science/api/pith-number/577PK3UQPTJWWV6ZJAPVBTZPZ3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3/action/storage_attestation","attest_author":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3/action/author_attestation","sign_citation":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3/action/citation_signature","submit_replication":"https://pith.science/pith/577PK3UQPTJWWV6ZJAPVBTZPZ3/action/replication_record"}},"created_at":"2026-05-28T01:04:52.554005+00:00","updated_at":"2026-05-28T01:04:52.554005+00:00"}