{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:EM4TLXU4OLFH7DBYOMGMWKKZYC","short_pith_number":"pith:EM4TLXU4","schema_version":"1.0","canonical_sha256":"233935de9c72ca7f8c38730ccb2959c08cb178f262b90803af8481f6bd7fa92f","source":{"kind":"arxiv","id":"2606.02352","version":1},"attestation_state":"computed","paper":{"title":"Multi-modal Video Representation Alignment for Robust Self-supervised Driver Distraction Detection","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"David J. Lerch, Frederik Diederichs, Livien Majer, Manuel Martin, Rainer Stiefelhagen, Zeyun Zhong","submitted_at":"2026-06-01T15:01:17Z","abstract_excerpt":"Robust self-supervised learning of multi-modal video representations is critical for real-world applications such as driver distraction detection, where multiple sensors provide complementary but noisy signals. Conventional contrastive objectives, such as InfoNCE, assume all negatives are equally informative and all positives are reliable. However, this assumption is frequently violated in multi-modal data due to viewpoint changes, occlusions, or semantic overlap across modalities. In this work, we propose a novel framework for multi-modal global alignment that addresses these challenges by jo"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.02352","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-06-01T15:01:17Z","cross_cats_sorted":[],"title_canon_sha256":"d0a06367b91fdc78baccebe097b857a51ad9aa466a0078ef75c2c18a8802b63b","abstract_canon_sha256":"c7064ef2fa6742d5658072b78964194c932c127bdc3122c2dd414e5c0cdbd51b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T03:04:56.836784Z","signature_b64":"UhfpuHKfCKWByTdSwTe1si/xZll69MXIKiyiT219SAZSKPuVlSp3ZQmS9HxyeORTNclLs+lWhUZ7HPo3rKeFDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"233935de9c72ca7f8c38730ccb2959c08cb178f262b90803af8481f6bd7fa92f","last_reissued_at":"2026-06-02T03:04:56.836385Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T03:04:56.836385Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Multi-modal Video Representation Alignment for Robust Self-supervised Driver Distraction Detection","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"David J. Lerch, Frederik Diederichs, Livien Majer, Manuel Martin, Rainer Stiefelhagen, Zeyun Zhong","submitted_at":"2026-06-01T15:01:17Z","abstract_excerpt":"Robust self-supervised learning of multi-modal video representations is critical for real-world applications such as driver distraction detection, where multiple sensors provide complementary but noisy signals. Conventional contrastive objectives, such as InfoNCE, assume all negatives are equally informative and all positives are reliable. However, this assumption is frequently violated in multi-modal data due to viewpoint changes, occlusions, or semantic overlap across modalities. In this work, we propose a novel framework for multi-modal global alignment that addresses these challenges by jo"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.02352","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.02352/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.02352","created_at":"2026-06-02T03:04:56.836440+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.02352v1","created_at":"2026-06-02T03:04:56.836440+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.02352","created_at":"2026-06-02T03:04:56.836440+00:00"},{"alias_kind":"pith_short_12","alias_value":"EM4TLXU4OLFH","created_at":"2026-06-02T03:04:56.836440+00:00"},{"alias_kind":"pith_short_16","alias_value":"EM4TLXU4OLFH7DBY","created_at":"2026-06-02T03:04:56.836440+00:00"},{"alias_kind":"pith_short_8","alias_value":"EM4TLXU4","created_at":"2026-06-02T03:04:56.836440+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC","json":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC.json","graph_json":"https://pith.science/api/pith-number/EM4TLXU4OLFH7DBYOMGMWKKZYC/graph.json","events_json":"https://pith.science/api/pith-number/EM4TLXU4OLFH7DBYOMGMWKKZYC/events.json","paper":"https://pith.science/paper/EM4TLXU4"},"agent_actions":{"view_html":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC","download_json":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC.json","view_paper":"https://pith.science/paper/EM4TLXU4","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.02352&json=true","fetch_graph":"https://pith.science/api/pith-number/EM4TLXU4OLFH7DBYOMGMWKKZYC/graph.json","fetch_events":"https://pith.science/api/pith-number/EM4TLXU4OLFH7DBYOMGMWKKZYC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC/action/storage_attestation","attest_author":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC/action/author_attestation","sign_citation":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC/action/citation_signature","submit_replication":"https://pith.science/pith/EM4TLXU4OLFH7DBYOMGMWKKZYC/action/replication_record"}},"created_at":"2026-06-02T03:04:56.836440+00:00","updated_at":"2026-06-02T03:04:56.836440+00:00"}