{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:DNTK7LWW6NNVW44ASAJCYMIFZ4","short_pith_number":"pith:DNTK7LWW","canonical_record":{"source":{"id":"2405.17430","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-05-27T17:59:56Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"df4adedd602d8cb146524e5de01236c4c4e1c8b3da415a9e2e30f633bcf6b200","abstract_canon_sha256":"3ebb791f7b844c3fe1292c07c88ec72e18bd006d4659c5d1fcaa08a6a324d0f4"},"schema_version":"1.0"},"canonical_sha256":"1b66afaed6f35b5b738090122c3105cf14b6eebdecbafa9eb1eb23e414347385","source":{"kind":"arxiv","id":"2405.17430","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2405.17430","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"arxiv_version","alias_value":"2405.17430v2","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2405.17430","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_12","alias_value":"DNTK7LWW6NNV","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_16","alias_value":"DNTK7LWW6NNVW44A","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_8","alias_value":"DNTK7LWW","created_at":"2026-07-05T08:49:34Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:DNTK7LWW6NNVW44ASAJCYMIFZ4","target":"record","payload":{"canonical_record":{"source":{"id":"2405.17430","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-05-27T17:59:56Z","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"title_canon_sha256":"df4adedd602d8cb146524e5de01236c4c4e1c8b3da415a9e2e30f633bcf6b200","abstract_canon_sha256":"3ebb791f7b844c3fe1292c07c88ec72e18bd006d4659c5d1fcaa08a6a324d0f4"},"schema_version":"1.0"},"canonical_sha256":"1b66afaed6f35b5b738090122c3105cf14b6eebdecbafa9eb1eb23e414347385","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T08:49:34.822966Z","signature_b64":"EK4qEhtyCHz6AcvSjN/er9F/QVFt5LD5MILiNRK9/hUfXo235VYeit8Yz5eQIakZ8hLu9NYg/qtDsWyiMWPzAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1b66afaed6f35b5b738090122c3105cf14b6eebdecbafa9eb1eb23e414347385","last_reissued_at":"2026-07-05T08:49:34.822451Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T08:49:34.822451Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2405.17430","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-07-05T08:49:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"QZkev1VVnnK37Sshk6GlMyvWDk2q1Q6NhvjHaWsfQXi5c3ZUyy5jPW4fJ6f0lB+9V67+XoQX62YKsvGz7SzADw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-05T15:51:01.175172Z"},"content_sha256":"3b9c7aabac2c5b3e666af69cc2c0165dc7356b4a4bed76ad63b60fda994d4f08","schema_version":"1.0","event_id":"sha256:3b9c7aabac2c5b3e666af69cc2c0165dc7356b4a4bed76ad63b60fda994d4f08"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:DNTK7LWW6NNVW44ASAJCYMIFZ4","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Matryoshka Multimodal Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL","cs.LG"],"primary_cat":"cs.CV","authors_text":"Jianfeng Gao, Jianwei Yang, Mu Cai, Yong Jae Lee","submitted_at":"2024-05-27T17:59:56Z","abstract_excerpt":"Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in visual-linguistic reasoning. These models first embed images into a fixed large number of visual tokens and then feed them into a Large Language Model (LLM). However, this design causes an excessive number of tokens for dense visual scenarios such as high-resolution images and videos, leading to great inefficiency. While token pruning/merging methods do exist, they produce a single length output for each image and do not afford flexibility in trading off information density v.s. efficiency. Inspired by the concept of"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2405.17430","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2405.17430/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-07-05T08:49:34Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"b9gwPx5Pcbt7wI+SQskflahpQ4sZ96n+vib3rxASgUKY8YKo438b5IBMooprVJ/JvCvy1o58yh6v3m3GDw+FAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-05T15:51:01.175845Z"},"content_sha256":"109046490b77bf51b72242d8ebe72654a89a18b4bc0ed9877635c895a8ad3fb6","schema_version":"1.0","event_id":"sha256:109046490b77bf51b72242d8ebe72654a89a18b4bc0ed9877635c895a8ad3fb6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/bundle.json","state_url":"https://pith.science/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-05T15:51:01Z","links":{"resolver":"https://pith.science/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4","bundle":"https://pith.science/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/bundle.json","state":"https://pith.science/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DNTK7LWW6NNVW44ASAJCYMIFZ4/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:DNTK7LWW6NNVW44ASAJCYMIFZ4","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3ebb791f7b844c3fe1292c07c88ec72e18bd006d4659c5d1fcaa08a6a324d0f4","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-05-27T17:59:56Z","title_canon_sha256":"df4adedd602d8cb146524e5de01236c4c4e1c8b3da415a9e2e30f633bcf6b200"},"schema_version":"1.0","source":{"id":"2405.17430","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2405.17430","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"arxiv_version","alias_value":"2405.17430v2","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2405.17430","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_12","alias_value":"DNTK7LWW6NNV","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_16","alias_value":"DNTK7LWW6NNVW44A","created_at":"2026-07-05T08:49:34Z"},{"alias_kind":"pith_short_8","alias_value":"DNTK7LWW","created_at":"2026-07-05T08:49:34Z"}],"graph_snapshots":[{"event_id":"sha256:109046490b77bf51b72242d8ebe72654a89a18b4bc0ed9877635c895a8ad3fb6","target":"graph","created_at":"2026-07-05T08:49:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2405.17430/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large Multimodal Models (LMMs) such as LLaVA have shown strong performance in visual-linguistic reasoning. These models first embed images into a fixed large number of visual tokens and then feed them into a Large Language Model (LLM). However, this design causes an excessive number of tokens for dense visual scenarios such as high-resolution images and videos, leading to great inefficiency. While token pruning/merging methods do exist, they produce a single length output for each image and do not afford flexibility in trading off information density v.s. efficiency. Inspired by the concept of","authors_text":"Jianfeng Gao, Jianwei Yang, Mu Cai, Yong Jae Lee","cross_cats":["cs.AI","cs.CL","cs.LG"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-05-27T17:59:56Z","title":"Matryoshka Multimodal Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2405.17430","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:3b9c7aabac2c5b3e666af69cc2c0165dc7356b4a4bed76ad63b60fda994d4f08","target":"record","created_at":"2026-07-05T08:49:34Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3ebb791f7b844c3fe1292c07c88ec72e18bd006d4659c5d1fcaa08a6a324d0f4","cross_cats_sorted":["cs.AI","cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2024-05-27T17:59:56Z","title_canon_sha256":"df4adedd602d8cb146524e5de01236c4c4e1c8b3da415a9e2e30f633bcf6b200"},"schema_version":"1.0","source":{"id":"2405.17430","kind":"arxiv","version":2}},"canonical_sha256":"1b66afaed6f35b5b738090122c3105cf14b6eebdecbafa9eb1eb23e414347385","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1b66afaed6f35b5b738090122c3105cf14b6eebdecbafa9eb1eb23e414347385","first_computed_at":"2026-07-05T08:49:34.822451Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-07-05T08:49:34.822451Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"EK4qEhtyCHz6AcvSjN/er9F/QVFt5LD5MILiNRK9/hUfXo235VYeit8Yz5eQIakZ8hLu9NYg/qtDsWyiMWPzAQ==","signature_status":"signed_v1","signed_at":"2026-07-05T08:49:34.822966Z","signed_message":"canonical_sha256_bytes"},"source_id":"2405.17430","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:3b9c7aabac2c5b3e666af69cc2c0165dc7356b4a4bed76ad63b60fda994d4f08","sha256:109046490b77bf51b72242d8ebe72654a89a18b4bc0ed9877635c895a8ad3fb6"],"state_sha256":"5564c89d2dac42426a0b062e9b5c930391c179cc5feeeac43c06edd97d22416e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"E1jaELWAy7k7CGIEcHLgLHKDTuKE8F88n7NU2ruMH4yVhfGUXqwczUJnDON4EpYg7CE3uTyaCLMfPyWOLwtpCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-05T15:51:01.178801Z","bundle_sha256":"b3b56eff6fb453eabbf22630b8bf690e93ad129213013e20ac1bc2886ccee620"}}