{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:2D4W3XNJE7NV6KSGR5NXZB43WM","short_pith_number":"pith:2D4W3XNJ","schema_version":"1.0","canonical_sha256":"d0f96ddda927db5f2a468f5b7c879bb308a9ddb8dcc468f4ac190f01d0edc690","source":{"kind":"arxiv","id":"2605.16918","version":1},"attestation_state":"computed","paper":{"title":"HighSync: High-Quality Lip Synchronization via Latent Diffusion Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"HighSync generates photorealistic lip-synced videos at 512x512 by removing data leakage that blocked genuine audio dependence.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Majid Iranpour Mobarekeh, Mehdi Bagheri, Mostafa Alavi, Saeed Firouzi Daghigh","submitted_at":"2026-05-16T10:20:52Z","abstract_excerpt":"We present HighSync, an end-to-end diffusion-based framework for high-fidelity lip synchronization that generates photorealistic talking-face videos aligned with arbitrary input audio. Existing approaches consistently struggle to reconcile image quality with synchronization accuracy, producing either visually degraded outputs or temporally inconsistent lip movements. HighSync addresses both challenges simultaneously and, to our knowledge, is the first lip sync model to operate natively at 512*512 resolution, positioning it as a viable solution for professional production environments such as t"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.16918","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-16T10:20:52Z","cross_cats_sorted":[],"title_canon_sha256":"5936ecc3e2a5f811e6b6da38c11863212823a7e903bc3f0b4e1e4ebad05a0e73","abstract_canon_sha256":"a87b49862d413237c930b251a7ff46f7969792feda0ecc4759f413fede0ad3eb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:30.400085Z","signature_b64":"gPFBFiH9mPGoFsKLX+wLXtsKXrTOR5428FwvhZCpT8WGfwuUtiNu1dOxkf2NPrvQ9992aR/OSYvB/huZx/b3Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d0f96ddda927db5f2a468f5b7c879bb308a9ddb8dcc468f4ac190f01d0edc690","last_reissued_at":"2026-05-20T00:03:30.399374Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:30.399374Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"HighSync: High-Quality Lip Synchronization via Latent Diffusion Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"HighSync generates photorealistic lip-synced videos at 512x512 by removing data leakage that blocked genuine audio dependence.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Majid Iranpour Mobarekeh, Mehdi Bagheri, Mostafa Alavi, Saeed Firouzi Daghigh","submitted_at":"2026-05-16T10:20:52Z","abstract_excerpt":"We present HighSync, an end-to-end diffusion-based framework for high-fidelity lip synchronization that generates photorealistic talking-face videos aligned with arbitrary input audio. Existing approaches consistently struggle to reconcile image quality with synchronization accuracy, producing either visually degraded outputs or temporally inconsistent lip movements. HighSync addresses both challenges simultaneously and, to our knowledge, is the first lip sync model to operate natively at 512*512 resolution, positioning it as a viable solution for professional production environments such as t"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"HighSync addresses both challenges simultaneously and, to our knowledge, is the first lip sync model to operate natively at 512*512 resolution, positioning it as a viable solution for professional production environments such as the film and broadcast industries.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The central premise that the identified data leakage phenomenon was silently undermining temporal modeling in all prior work and that its systematic elimination directly produces genuine audio dependence without introducing new artifacts or requiring other unstated modeling changes.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"HighSync is a diffusion-based lip synchronization system that operates natively at 512x512 resolution by eliminating data leakage to enforce genuine audio dependence and reports state-of-the-art results on quality and sync metrics.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"HighSync generates photorealistic lip-synced videos at 512x512 by removing data leakage that blocked genuine audio dependence.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"842756300b9bbef4a38d7e65c3a7ed273d01310966757029f88a805ecc4609bd"},"source":{"id":"2605.16918","kind":"arxiv","version":1},"verdict":{"id":"b4d512ca-2212-4044-9601-381ab717cf71","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T21:10:47.162529Z","strongest_claim":"HighSync addresses both challenges simultaneously and, to our knowledge, is the first lip sync model to operate natively at 512*512 resolution, positioning it as a viable solution for professional production environments such as the film and broadcast industries.","one_line_summary":"HighSync is a diffusion-based lip synchronization system that operates natively at 512x512 resolution by eliminating data leakage to enforce genuine audio dependence and reports state-of-the-art results on quality and sync metrics.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The central premise that the identified data leakage phenomenon was silently undermining temporal modeling in all prior work and that its systematic elimination directly produces genuine audio dependence without introducing new artifacts or requiring other unstated modeling changes.","pith_extraction_headline":"HighSync generates photorealistic lip-synced videos at 512x512 by removing data leakage that blocked genuine audio dependence."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16918/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T21:31:19.143935Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T21:20:52.894160Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"cited_work_retraction","ran_at":"2026-05-19T20:51:56.582040Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T18:41:56.264001Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.344353Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"0ede16f21b5400c0a3c960801c706129f38a7ded2fc75dcb9228e211f73e3e03"},"references":{"count":28,"sample":[{"doi":"","year":2020,"title":"A lip sync expert is all you need for speech to lip generation in the wild,","work_id":"7fb34ac4-2b8c-4064-8a01-ac1c71ccb7d1","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Diff2Lip: Audio conditioned diffusion models for lip-synchronization,","work_id":"d61877f8-fce6-49d5-95c1-671524ab4c9b","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Yuet al., “Make your actor talk: Generalizable and high-fidelity 11 Fig","work_id":"038890fc-22aa-4643-a880-ac096e3dbff8","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Latentsync: Taming audio-conditioned latent diffusion models for lip sync with syncnet supervision","work_id":"58bec219-b581-4884-a7c0-51325341150e","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"MuseTalk: Real-time high-fidelity video dubbing via spatio-temporal sampling,","work_id":"fa706bf8-0093-4b79-bfb9-67a09220ee2e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":28,"snapshot_sha256":"cdd19aec3555ce3a3dbe114bd66a13c528a7860e79fb1e413c7993efee55571c","internal_anchors":5},"formal_canon":{"evidence_count":2,"snapshot_sha256":"eeed5f3d949bac802ae7816fb398d0e9bc4f34925784e6c6151d059210c4431e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.16918","created_at":"2026-05-20T00:03:30.399474+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.16918v1","created_at":"2026-05-20T00:03:30.399474+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16918","created_at":"2026-05-20T00:03:30.399474+00:00"},{"alias_kind":"pith_short_12","alias_value":"2D4W3XNJE7NV","created_at":"2026-05-20T00:03:30.399474+00:00"},{"alias_kind":"pith_short_16","alias_value":"2D4W3XNJE7NV6KSG","created_at":"2026-05-20T00:03:30.399474+00:00"},{"alias_kind":"pith_short_8","alias_value":"2D4W3XNJ","created_at":"2026-05-20T00:03:30.399474+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM","json":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM.json","graph_json":"https://pith.science/api/pith-number/2D4W3XNJE7NV6KSGR5NXZB43WM/graph.json","events_json":"https://pith.science/api/pith-number/2D4W3XNJE7NV6KSGR5NXZB43WM/events.json","paper":"https://pith.science/paper/2D4W3XNJ"},"agent_actions":{"view_html":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM","download_json":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM.json","view_paper":"https://pith.science/paper/2D4W3XNJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.16918&json=true","fetch_graph":"https://pith.science/api/pith-number/2D4W3XNJE7NV6KSGR5NXZB43WM/graph.json","fetch_events":"https://pith.science/api/pith-number/2D4W3XNJE7NV6KSGR5NXZB43WM/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM/action/storage_attestation","attest_author":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM/action/author_attestation","sign_citation":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM/action/citation_signature","submit_replication":"https://pith.science/pith/2D4W3XNJE7NV6KSGR5NXZB43WM/action/replication_record"}},"created_at":"2026-05-20T00:03:30.399474+00:00","updated_at":"2026-05-20T00:03:30.399474+00:00"}