{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2017:6CKUK4ZIJ4V56JBR45GKYLHCGZ","short_pith_number":"pith:6CKUK4ZI","canonical_record":{"source":{"id":"1706.00079","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MM","submitted_at":"2017-05-31T20:35:26Z","cross_cats_sorted":["cs.CV","cs.SD"],"title_canon_sha256":"f60b432dfcde3e1d2f6492e90fd54af3cb5038f07d97146d745164c6e7c0496d","abstract_canon_sha256":"dcc0b53c83dfa891780fd61c1ce18a685f0d4630b9dcc16d3ff1a004857ad956"},"schema_version":"1.0"},"canonical_sha256":"f0954573284f2bdf2431e74cac2ce236419bea685a438651579e955c77d50665","source":{"kind":"arxiv","id":"1706.00079","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1706.00079","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"arxiv_version","alias_value":"1706.00079v1","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.00079","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"pith_short_12","alias_value":"6CKUK4ZIJ4V5","created_at":"2026-05-18T12:31:03Z"},{"alias_kind":"pith_short_16","alias_value":"6CKUK4ZIJ4V56JBR","created_at":"2026-05-18T12:31:03Z"},{"alias_kind":"pith_short_8","alias_value":"6CKUK4ZI","created_at":"2026-05-18T12:31:03Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2017:6CKUK4ZIJ4V56JBR45GKYLHCGZ","target":"record","payload":{"canonical_record":{"source":{"id":"1706.00079","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MM","submitted_at":"2017-05-31T20:35:26Z","cross_cats_sorted":["cs.CV","cs.SD"],"title_canon_sha256":"f60b432dfcde3e1d2f6492e90fd54af3cb5038f07d97146d745164c6e7c0496d","abstract_canon_sha256":"dcc0b53c83dfa891780fd61c1ce18a685f0d4630b9dcc16d3ff1a004857ad956"},"schema_version":"1.0"},"canonical_sha256":"f0954573284f2bdf2431e74cac2ce236419bea685a438651579e955c77d50665","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:43:15.087979Z","signature_b64":"Pjo3Oo0GM2RQFpWP0BvrxWZ4Vk0G+1VGHv6nygnByaFSySUnyfslV2RvmfoduWys9mI2Yf4yI7Cqcj7K6AZ1DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f0954573284f2bdf2431e74cac2ce236419bea685a438651579e955c77d50665","last_reissued_at":"2026-05-18T00:43:15.087257Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:43:15.087257Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1706.00079","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:43:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cOln79+w6E9GK7Z/UdPB8dS9+bKJ8yNBnB5rCBJpgys7kbvuzrF6+duaq1mJzGEpX8oUG9WAyuYYBfa6RhZsBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T01:50:44.123621Z"},"content_sha256":"33b070ab15cf43e9ed2216936cf89d050aa9d1add3e2941956452ac064f1594a","schema_version":"1.0","event_id":"sha256:33b070ab15cf43e9ed2216936cf89d050aa9d1add3e2941956452ac064f1594a"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2017:6CKUK4ZIJ4V56JBR45GKYLHCGZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Putting a Face to the Voice: Fusing Audio and Visual Signals Across a Video to Determine Speakers","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV","cs.SD"],"primary_cat":"cs.MM","authors_text":"Caroline Pantofaru, Ian Sturdy, Ken Hoover, Malcolm Slaney, Sourish Chaudhuri","submitted_at":"2017-05-31T20:35:26Z","abstract_excerpt":"In this paper, we present a system that associates faces with voices in a video by fusing information from the audio and visual signals. The thesis underlying our work is that an extremely simple approach to generating (weak) speech clusters can be combined with visual signals to effectively associate faces and voices by aggregating statistics across a video. This approach does not need any training data specific to this task and leverages the natural coherence of information in the audio and visual streams. It is particularly applicable to tracking speakers in videos on the web where a priori"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1706.00079","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:43:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"i0vm9tCmemD1WNN6C67UXwjvSFEtk/FE/2/IOerYkMTSPtJLGaCRlxefhZ65anEaPracZ116yiaPmq9AMO01CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T01:50:44.123981Z"},"content_sha256":"e20886539cec9c2cc586acd3420d3f53a8fa79348c167abc9b9dbd0d3a2df039","schema_version":"1.0","event_id":"sha256:e20886539cec9c2cc586acd3420d3f53a8fa79348c167abc9b9dbd0d3a2df039"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/bundle.json","state_url":"https://pith.science/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T01:50:44Z","links":{"resolver":"https://pith.science/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ","bundle":"https://pith.science/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/bundle.json","state":"https://pith.science/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/6CKUK4ZIJ4V56JBR45GKYLHCGZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2017:6CKUK4ZIJ4V56JBR45GKYLHCGZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"dcc0b53c83dfa891780fd61c1ce18a685f0d4630b9dcc16d3ff1a004857ad956","cross_cats_sorted":["cs.CV","cs.SD"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MM","submitted_at":"2017-05-31T20:35:26Z","title_canon_sha256":"f60b432dfcde3e1d2f6492e90fd54af3cb5038f07d97146d745164c6e7c0496d"},"schema_version":"1.0","source":{"id":"1706.00079","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1706.00079","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"arxiv_version","alias_value":"1706.00079v1","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.00079","created_at":"2026-05-18T00:43:15Z"},{"alias_kind":"pith_short_12","alias_value":"6CKUK4ZIJ4V5","created_at":"2026-05-18T12:31:03Z"},{"alias_kind":"pith_short_16","alias_value":"6CKUK4ZIJ4V56JBR","created_at":"2026-05-18T12:31:03Z"},{"alias_kind":"pith_short_8","alias_value":"6CKUK4ZI","created_at":"2026-05-18T12:31:03Z"}],"graph_snapshots":[{"event_id":"sha256:e20886539cec9c2cc586acd3420d3f53a8fa79348c167abc9b9dbd0d3a2df039","target":"graph","created_at":"2026-05-18T00:43:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"In this paper, we present a system that associates faces with voices in a video by fusing information from the audio and visual signals. The thesis underlying our work is that an extremely simple approach to generating (weak) speech clusters can be combined with visual signals to effectively associate faces and voices by aggregating statistics across a video. This approach does not need any training data specific to this task and leverages the natural coherence of information in the audio and visual streams. It is particularly applicable to tracking speakers in videos on the web where a priori","authors_text":"Caroline Pantofaru, Ian Sturdy, Ken Hoover, Malcolm Slaney, Sourish Chaudhuri","cross_cats":["cs.CV","cs.SD"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MM","submitted_at":"2017-05-31T20:35:26Z","title":"Putting a Face to the Voice: Fusing Audio and Visual Signals Across a Video to Determine Speakers"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1706.00079","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:33b070ab15cf43e9ed2216936cf89d050aa9d1add3e2941956452ac064f1594a","target":"record","created_at":"2026-05-18T00:43:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"dcc0b53c83dfa891780fd61c1ce18a685f0d4630b9dcc16d3ff1a004857ad956","cross_cats_sorted":["cs.CV","cs.SD"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MM","submitted_at":"2017-05-31T20:35:26Z","title_canon_sha256":"f60b432dfcde3e1d2f6492e90fd54af3cb5038f07d97146d745164c6e7c0496d"},"schema_version":"1.0","source":{"id":"1706.00079","kind":"arxiv","version":1}},"canonical_sha256":"f0954573284f2bdf2431e74cac2ce236419bea685a438651579e955c77d50665","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f0954573284f2bdf2431e74cac2ce236419bea685a438651579e955c77d50665","first_computed_at":"2026-05-18T00:43:15.087257Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:43:15.087257Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Pjo3Oo0GM2RQFpWP0BvrxWZ4Vk0G+1VGHv6nygnByaFSySUnyfslV2RvmfoduWys9mI2Yf4yI7Cqcj7K6AZ1DQ==","signature_status":"signed_v1","signed_at":"2026-05-18T00:43:15.087979Z","signed_message":"canonical_sha256_bytes"},"source_id":"1706.00079","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:33b070ab15cf43e9ed2216936cf89d050aa9d1add3e2941956452ac064f1594a","sha256:e20886539cec9c2cc586acd3420d3f53a8fa79348c167abc9b9dbd0d3a2df039"],"state_sha256":"58d01a451ef33ecd3038a631cd91b03b4f792377b6964574054de97773acdc10"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"KhFoFIPjxzbC8Rr/XrfSPdVHXl/Pka62MdhuA6ttejl1BHRZGPnLevBD7K39gQJoJ46ZCpnroD378d0NTVIpDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T01:50:44.126939Z","bundle_sha256":"b7c5f42f963552405be8d536f9d98e48893c3990a7e2a6991beb38808d5daedd"}}