{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:VNQ6SOSQB5APBZ2EWAIBCD2S2K","short_pith_number":"pith:VNQ6SOSQ","schema_version":"1.0","canonical_sha256":"ab61e93a500f40f0e744b010110f52d2a3ff5dfc4c6bd80b473bbf6b314abe96","source":{"kind":"arxiv","id":"2306.09344","version":3},"attestation_state":"computed","paper":{"title":"DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Lucy Chai, Netanel Tamir, Phillip Isola, Richard Zhang, Shobhita Sundaram, Stephanie Fu, Tali Dekel","submitted_at":"2023-06-15T17:59:50Z","abstract_excerpt":"Current perceptual similarity metrics operate at the level of pixels and patches. These metrics compare images in terms of their low-level colors and textures, but fail to capture mid-level similarities and differences in image layout, object pose, and semantic content. In this paper, we develop a perceptual metric that assesses images holistically. Our first step is to collect a new dataset of human similarity judgments over image pairs that are alike in diverse ways. Critical to this dataset is that judgments are nearly automatic and shared by all observers. To achieve this we use recent tex"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2306.09344","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-06-15T17:59:50Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"6b15559812b768478c4ce26a4e826d7526810fff09fa9c111b7dd7c11ac991f0","abstract_canon_sha256":"95057f0b1ace1c566b6c3d93c60ee09f9390c426aa371af21f4a01ac43098907"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:05:31.825623Z","signature_b64":"bOxYeZCuGWUyJmUhDoWlm+qMADPH3kwRiZIflTs79qiQvCfzp/883KpExDUP8Y9gx4/lmI+VqOZIU3FaRWRpBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ab61e93a500f40f0e744b010110f52d2a3ff5dfc4c6bd80b473bbf6b314abe96","last_reissued_at":"2026-05-27T01:05:31.824924Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:05:31.824924Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.CV","authors_text":"Lucy Chai, Netanel Tamir, Phillip Isola, Richard Zhang, Shobhita Sundaram, Stephanie Fu, Tali Dekel","submitted_at":"2023-06-15T17:59:50Z","abstract_excerpt":"Current perceptual similarity metrics operate at the level of pixels and patches. These metrics compare images in terms of their low-level colors and textures, but fail to capture mid-level similarities and differences in image layout, object pose, and semantic content. In this paper, we develop a perceptual metric that assesses images holistically. Our first step is to collect a new dataset of human similarity judgments over image pairs that are alike in diverse ways. Critical to this dataset is that judgments are nearly automatic and shared by all observers. To achieve this we use recent tex"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2306.09344","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2306.09344/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2306.09344","created_at":"2026-05-27T01:05:31.824995+00:00"},{"alias_kind":"arxiv_version","alias_value":"2306.09344v3","created_at":"2026-05-27T01:05:31.824995+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2306.09344","created_at":"2026-05-27T01:05:31.824995+00:00"},{"alias_kind":"pith_short_12","alias_value":"VNQ6SOSQB5AP","created_at":"2026-05-27T01:05:31.824995+00:00"},{"alias_kind":"pith_short_16","alias_value":"VNQ6SOSQB5APBZ2E","created_at":"2026-05-27T01:05:31.824995+00:00"},{"alias_kind":"pith_short_8","alias_value":"VNQ6SOSQ","created_at":"2026-05-27T01:05:31.824995+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":20,"internal_anchor_count":20,"sample":[{"citing_arxiv_id":"2505.17726","citing_title":"Slot-MLLM: Object-Centric Visual Tokenization for Multimodal LLM","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2512.12598","citing_title":"Setting the Stage: Text-Driven Scene-Consistent Image Generation","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20337","citing_title":"Capability $\\neq$ Interpretability: Human Interpretability of Vision Foundation Models","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20777","citing_title":"AttriStory: Fine-grained Attribute Realization for Visual Storytelling with Diffusion Models","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18324","citing_title":"Improved Baselines with Representation Autoencoders","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19032","citing_title":"Personalized Face Privacy Protection From a Single Image","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2508.09547","citing_title":"GoViG: Goal-Conditioned Visual Navigation Instruction Generation via Multimodal Reasoning","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2410.05160","citing_title":"VLM2Vec: Training Vision-Language Models for Massive Multimodal Embedding Tasks","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2512.19115","citing_title":"Generative Giants, Retrieval Weaklings: Why do Multimodal Large Language Models Fail at Multimodal Retrieval?","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2601.00090","citing_title":"It's Never Too Late: Noise Optimization for Collapse Recovery in Trained Diffusion Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02003","citing_title":"ProDiG: Progressive Diffusion-Guided Gaussian Splatting for Aerial to Ground Reconstruction","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06061","citing_title":"PromptEvolver: Prompt Inversion through Evolutionary Optimization in Natural-Language Space","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11927","citing_title":"RealDiffusion: Physics-informed Attention for Multi-character Storybook Generation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12090","citing_title":"World Action Models: The Next Frontier in Embodied AI","ref_index":209,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11797","citing_title":"SyncFix: Fixing 3D Reconstructions via Multi-View Synchronization","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08500","citing_title":"Novel View Synthesis as Video Completion","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05039","citing_title":"ID-Sim: An Identity-Focused Similarity Metric","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15453","citing_title":"(1D) Ordered Tokens Enable Efficient Test-Time Search","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22855","citing_title":"Evaluating Remote Sensing Image Captions Beyond Metric Biases","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02583","citing_title":"Stylistic Attribute Control in Latent Diffusion Models","ref_index":25,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K","json":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K.json","graph_json":"https://pith.science/api/pith-number/VNQ6SOSQB5APBZ2EWAIBCD2S2K/graph.json","events_json":"https://pith.science/api/pith-number/VNQ6SOSQB5APBZ2EWAIBCD2S2K/events.json","paper":"https://pith.science/paper/VNQ6SOSQ"},"agent_actions":{"view_html":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K","download_json":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K.json","view_paper":"https://pith.science/paper/VNQ6SOSQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2306.09344&json=true","fetch_graph":"https://pith.science/api/pith-number/VNQ6SOSQB5APBZ2EWAIBCD2S2K/graph.json","fetch_events":"https://pith.science/api/pith-number/VNQ6SOSQB5APBZ2EWAIBCD2S2K/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K/action/timestamp_anchor","attest_storage":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K/action/storage_attestation","attest_author":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K/action/author_attestation","sign_citation":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K/action/citation_signature","submit_replication":"https://pith.science/pith/VNQ6SOSQB5APBZ2EWAIBCD2S2K/action/replication_record"}},"created_at":"2026-05-27T01:05:31.824995+00:00","updated_at":"2026-05-27T01:05:31.824995+00:00"}