{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:CQCVNKEYX5VH747EJVNPHSH6YV","short_pith_number":"pith:CQCVNKEY","schema_version":"1.0","canonical_sha256":"140556a898bf6a7ff3e44d5af3c8fec55d9be5747aa512a5bc8aab65fadf15c2","source":{"kind":"arxiv","id":"1809.01337","version":1},"attestation_state":"computed","paper":{"title":"Localizing Moments in Video with Temporal Language","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bryan Russell, Eli Shechtman, Josef Sivic, Lisa Anne Hendricks, Oliver Wang, Trevor Darrell","submitted_at":"2018-09-05T05:58:47Z","abstract_excerpt":"Localizing moments in a longer video via natural language queries is a new, challenging task at the intersection of language and video understanding. Though moment localization with natural language is similar to other language and vision tasks like natural language object retrieval in images, moment localization offers an interesting opportunity to model temporal dependencies and reasoning in text. We propose a new model that explicitly reasons about different temporal segments in a video, and shows that temporal context is important for localizing phrases which include temporal language. To "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.01337","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-09-05T05:58:47Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"240adb1f6781f0916ed2def5c922e778002d1a872651f1e74c2f385a82823fef","abstract_canon_sha256":"873ed413d27bc44e7b8db110b2a09f6577e52d3031f1e48abca45b38112d25f9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:06:27.936329Z","signature_b64":"G/T5bcyknl83IjCjTRtcgQdmD5xqm+cpx78Uc2EReQdHkXtJ5XGmUKrvO6gEp/iBa1sbdv7MX2VIbZsHJs3vAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"140556a898bf6a7ff3e44d5af3c8fec55d9be5747aa512a5bc8aab65fadf15c2","last_reissued_at":"2026-05-18T00:06:27.935788Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:06:27.935788Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Localizing Moments in Video with Temporal Language","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bryan Russell, Eli Shechtman, Josef Sivic, Lisa Anne Hendricks, Oliver Wang, Trevor Darrell","submitted_at":"2018-09-05T05:58:47Z","abstract_excerpt":"Localizing moments in a longer video via natural language queries is a new, challenging task at the intersection of language and video understanding. Though moment localization with natural language is similar to other language and vision tasks like natural language object retrieval in images, moment localization offers an interesting opportunity to model temporal dependencies and reasoning in text. We propose a new model that explicitly reasons about different temporal segments in a video, and shows that temporal context is important for localizing phrases which include temporal language. To "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.01337","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.01337","created_at":"2026-05-18T00:06:27.935880+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.01337v1","created_at":"2026-05-18T00:06:27.935880+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.01337","created_at":"2026-05-18T00:06:27.935880+00:00"},{"alias_kind":"pith_short_12","alias_value":"CQCVNKEYX5VH","created_at":"2026-05-18T12:32:16.446611+00:00"},{"alias_kind":"pith_short_16","alias_value":"CQCVNKEYX5VH747E","created_at":"2026-05-18T12:32:16.446611+00:00"},{"alias_kind":"pith_short_8","alias_value":"CQCVNKEY","created_at":"2026-05-18T12:32:16.446611+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2312.02549","citing_title":"DemaFormer: Damped Exponential Moving Average Transformer with Energy-Based Modeling for Temporal Language Grounding","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2310.01852","citing_title":"LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment","ref_index":90,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV","json":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV.json","graph_json":"https://pith.science/api/pith-number/CQCVNKEYX5VH747EJVNPHSH6YV/graph.json","events_json":"https://pith.science/api/pith-number/CQCVNKEYX5VH747EJVNPHSH6YV/events.json","paper":"https://pith.science/paper/CQCVNKEY"},"agent_actions":{"view_html":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV","download_json":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV.json","view_paper":"https://pith.science/paper/CQCVNKEY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.01337&json=true","fetch_graph":"https://pith.science/api/pith-number/CQCVNKEYX5VH747EJVNPHSH6YV/graph.json","fetch_events":"https://pith.science/api/pith-number/CQCVNKEYX5VH747EJVNPHSH6YV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV/action/storage_attestation","attest_author":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV/action/author_attestation","sign_citation":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV/action/citation_signature","submit_replication":"https://pith.science/pith/CQCVNKEYX5VH747EJVNPHSH6YV/action/replication_record"}},"created_at":"2026-05-18T00:06:27.935880+00:00","updated_at":"2026-05-18T00:06:27.935880+00:00"}