{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:244N6LUAE5EOMS2MABMCSN73IR","short_pith_number":"pith:244N6LUA","schema_version":"1.0","canonical_sha256":"d738df2e802748e64b4c00582937fb444fd079bf7e6af2fec9f10c013b3cf833","source":{"kind":"arxiv","id":"2606.07182","version":1},"attestation_state":"computed","paper":{"title":"Audio Imitator: Controlling Timbre and Tempo in Video2Audio Synthesis with Audio Reference","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"eess.AS","authors_text":"Cheng Gong, Chunyu Qiang, Feng Deng, Jiahui Zhao, Longbiao Wang, Tianrui Wang, Xijuan Zeng","submitted_at":"2026-06-05T11:46:28Z","abstract_excerpt":"Video-to-audio generation has made significant progress in achieving semantic consistency and temporal alignment from silent videos. However, audio contains rich stylistic attributes such as timbre and tempo that are difficult to infer from visual and textual inputs alone. While reference audio can serve as additional conditioning, it is typically treated as a holistic signal, limiting fine-grained style control. We propose AudioIM, an attribute-aware framework that explicitly models timbre and tempo as separate control factors rather than relying on holistic prompt conditioning. Dual encoders"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.07182","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"eess.AS","submitted_at":"2026-06-05T11:46:28Z","cross_cats_sorted":[],"title_canon_sha256":"bcdb31f7c7e24e7c2ee1067af2d13138f6be4f6e6462a534b171860de21a5a90","abstract_canon_sha256":"421a7db6beab604d53d31cb7061e0323bf18d23f903d898f2e736283b8a54279"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:04:51.283692Z","signature_b64":"gwzjKD+UG/yHHxwryXJ/3Vj0MN04KkHGZM+9NIdqHfFnU1J44X+4y2RiobwXe0R2lm/IKY8fxIsOu+QkPsfhAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d738df2e802748e64b4c00582937fb444fd079bf7e6af2fec9f10c013b3cf833","last_reissued_at":"2026-06-08T01:04:51.282864Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:04:51.282864Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Audio Imitator: Controlling Timbre and Tempo in Video2Audio Synthesis with Audio Reference","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"eess.AS","authors_text":"Cheng Gong, Chunyu Qiang, Feng Deng, Jiahui Zhao, Longbiao Wang, Tianrui Wang, Xijuan Zeng","submitted_at":"2026-06-05T11:46:28Z","abstract_excerpt":"Video-to-audio generation has made significant progress in achieving semantic consistency and temporal alignment from silent videos. However, audio contains rich stylistic attributes such as timbre and tempo that are difficult to infer from visual and textual inputs alone. While reference audio can serve as additional conditioning, it is typically treated as a holistic signal, limiting fine-grained style control. We propose AudioIM, an attribute-aware framework that explicitly models timbre and tempo as separate control factors rather than relying on holistic prompt conditioning. Dual encoders"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.07182","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.07182/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.07182","created_at":"2026-06-08T01:04:51.282971+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.07182v1","created_at":"2026-06-08T01:04:51.282971+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.07182","created_at":"2026-06-08T01:04:51.282971+00:00"},{"alias_kind":"pith_short_12","alias_value":"244N6LUAE5EO","created_at":"2026-06-08T01:04:51.282971+00:00"},{"alias_kind":"pith_short_16","alias_value":"244N6LUAE5EOMS2M","created_at":"2026-06-08T01:04:51.282971+00:00"},{"alias_kind":"pith_short_8","alias_value":"244N6LUA","created_at":"2026-06-08T01:04:51.282971+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR","json":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR.json","graph_json":"https://pith.science/api/pith-number/244N6LUAE5EOMS2MABMCSN73IR/graph.json","events_json":"https://pith.science/api/pith-number/244N6LUAE5EOMS2MABMCSN73IR/events.json","paper":"https://pith.science/paper/244N6LUA"},"agent_actions":{"view_html":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR","download_json":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR.json","view_paper":"https://pith.science/paper/244N6LUA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.07182&json=true","fetch_graph":"https://pith.science/api/pith-number/244N6LUAE5EOMS2MABMCSN73IR/graph.json","fetch_events":"https://pith.science/api/pith-number/244N6LUAE5EOMS2MABMCSN73IR/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR/action/timestamp_anchor","attest_storage":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR/action/storage_attestation","attest_author":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR/action/author_attestation","sign_citation":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR/action/citation_signature","submit_replication":"https://pith.science/pith/244N6LUAE5EOMS2MABMCSN73IR/action/replication_record"}},"created_at":"2026-06-08T01:04:51.282971+00:00","updated_at":"2026-06-08T01:04:51.282971+00:00"}