{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:XT6F5IMD6ZUONLIGHCFRZTO32U","short_pith_number":"pith:XT6F5IMD","canonical_record":{"source":{"id":"2605.26656","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T07:41:48Z","cross_cats_sorted":[],"title_canon_sha256":"f68fd05ae1d927ca3bc1fd61907e5a625823abcb843587d2b008e4ded3023c54","abstract_canon_sha256":"4d3f7117fb34bbf7181e89865c0a602d59c603180576cbcc809f046a7636c9e4"},"schema_version":"1.0"},"canonical_sha256":"bcfc5ea183f668e6ad06388b1ccddbd532c12496b36fbceed2c9f27067ceefe4","source":{"kind":"arxiv","id":"2605.26656","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26656","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26656v1","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26656","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_12","alias_value":"XT6F5IMD6ZUO","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_16","alias_value":"XT6F5IMD6ZUONLIG","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_8","alias_value":"XT6F5IMD","created_at":"2026-05-27T01:06:04Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:XT6F5IMD6ZUONLIGHCFRZTO32U","target":"record","payload":{"canonical_record":{"source":{"id":"2605.26656","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T07:41:48Z","cross_cats_sorted":[],"title_canon_sha256":"f68fd05ae1d927ca3bc1fd61907e5a625823abcb843587d2b008e4ded3023c54","abstract_canon_sha256":"4d3f7117fb34bbf7181e89865c0a602d59c603180576cbcc809f046a7636c9e4"},"schema_version":"1.0"},"canonical_sha256":"bcfc5ea183f668e6ad06388b1ccddbd532c12496b36fbceed2c9f27067ceefe4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:06:04.328432Z","signature_b64":"qXritU6FNhhBi3zCHkyj78jvVR0HhMOprZnCTwypCizuwPM5DPkeONpfQixp5nzT2FcGBl9LOhDDmcyZijfRBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bcfc5ea183f668e6ad06388b1ccddbd532c12496b36fbceed2c9f27067ceefe4","last_reissued_at":"2026-05-27T01:06:04.327615Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:06:04.327615Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.26656","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:06:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UnUgts6swPL6Z+QQxnIHZx7MNud1Lx0HriSoqOJuvStwNR8FZAs0QxSqGt/9fKqdkkwFt4jLbFZIgmZDr648AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T15:27:13.116252Z"},"content_sha256":"96057dbb355b4576e12620b0cb2adca3c87c857c1492299ce6b8b46dac6f45b0","schema_version":"1.0","event_id":"sha256:96057dbb355b4576e12620b0cb2adca3c87c857c1492299ce6b8b46dac6f45b0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:XT6F5IMD6ZUONLIGHCFRZTO32U","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DV-SFT: Direct Vision Supervision for Fine-Grained Visual Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bing Wang, Chong Feng, Feng Zhang, Jianfei Zhao, Xin Sun, Zhixing Tan","submitted_at":"2026-05-26T07:41:48Z","abstract_excerpt":"Multimodal large language models are typically trained end-to-end to predict ground-truth answers, yet supervision signals are applied exclusively to text tokens. Visual tokens, the core carriers of visual information, are optimized only implicitly as part of the context, leading to coarse-grained visual understanding. Prior works attempt to supervise visual inputs but inevitably rely on auxiliary components such as additional decoders or forward passes, because visual tokens lack readily interpretable labels. This limits their practical applicability. In this work, we propose \\textbf{D}irect "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26656","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.26656/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:06:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5rGALgKgPaqlPOGtXlaZq/Qqbe8ybL+ZrS+k5y0HypKnZ8gh6dWw7hw5mRPQCTzcc84Cdtbv+oK5po6NXTddDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T15:27:13.117251Z"},"content_sha256":"a4352b3055571aa7c82c0718ef9a29477108c0d37fed153c4ca2ddc7e2fede0c","schema_version":"1.0","event_id":"sha256:a4352b3055571aa7c82c0718ef9a29477108c0d37fed153c4ca2ddc7e2fede0c"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/bundle.json","state_url":"https://pith.science/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T15:27:13Z","links":{"resolver":"https://pith.science/pith/XT6F5IMD6ZUONLIGHCFRZTO32U","bundle":"https://pith.science/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/bundle.json","state":"https://pith.science/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/state.json","well_known_bundle":"https://pith.science/.well-known/pith/XT6F5IMD6ZUONLIGHCFRZTO32U/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:XT6F5IMD6ZUONLIGHCFRZTO32U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4d3f7117fb34bbf7181e89865c0a602d59c603180576cbcc809f046a7636c9e4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T07:41:48Z","title_canon_sha256":"f68fd05ae1d927ca3bc1fd61907e5a625823abcb843587d2b008e4ded3023c54"},"schema_version":"1.0","source":{"id":"2605.26656","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.26656","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"arxiv_version","alias_value":"2605.26656v1","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26656","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_12","alias_value":"XT6F5IMD6ZUO","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_16","alias_value":"XT6F5IMD6ZUONLIG","created_at":"2026-05-27T01:06:04Z"},{"alias_kind":"pith_short_8","alias_value":"XT6F5IMD","created_at":"2026-05-27T01:06:04Z"}],"graph_snapshots":[{"event_id":"sha256:a4352b3055571aa7c82c0718ef9a29477108c0d37fed153c4ca2ddc7e2fede0c","target":"graph","created_at":"2026-05-27T01:06:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.26656/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Multimodal large language models are typically trained end-to-end to predict ground-truth answers, yet supervision signals are applied exclusively to text tokens. Visual tokens, the core carriers of visual information, are optimized only implicitly as part of the context, leading to coarse-grained visual understanding. Prior works attempt to supervise visual inputs but inevitably rely on auxiliary components such as additional decoders or forward passes, because visual tokens lack readily interpretable labels. This limits their practical applicability. In this work, we propose \\textbf{D}irect ","authors_text":"Bing Wang, Chong Feng, Feng Zhang, Jianfei Zhao, Xin Sun, Zhixing Tan","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T07:41:48Z","title":"DV-SFT: Direct Vision Supervision for Fine-Grained Visual Understanding"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26656","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:96057dbb355b4576e12620b0cb2adca3c87c857c1492299ce6b8b46dac6f45b0","target":"record","created_at":"2026-05-27T01:06:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4d3f7117fb34bbf7181e89865c0a602d59c603180576cbcc809f046a7636c9e4","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T07:41:48Z","title_canon_sha256":"f68fd05ae1d927ca3bc1fd61907e5a625823abcb843587d2b008e4ded3023c54"},"schema_version":"1.0","source":{"id":"2605.26656","kind":"arxiv","version":1}},"canonical_sha256":"bcfc5ea183f668e6ad06388b1ccddbd532c12496b36fbceed2c9f27067ceefe4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bcfc5ea183f668e6ad06388b1ccddbd532c12496b36fbceed2c9f27067ceefe4","first_computed_at":"2026-05-27T01:06:04.327615Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:06:04.327615Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"qXritU6FNhhBi3zCHkyj78jvVR0HhMOprZnCTwypCizuwPM5DPkeONpfQixp5nzT2FcGBl9LOhDDmcyZijfRBw==","signature_status":"signed_v1","signed_at":"2026-05-27T01:06:04.328432Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.26656","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:96057dbb355b4576e12620b0cb2adca3c87c857c1492299ce6b8b46dac6f45b0","sha256:a4352b3055571aa7c82c0718ef9a29477108c0d37fed153c4ca2ddc7e2fede0c"],"state_sha256":"7afc9f66f78405157f21aa2a5d22009f54174ab2600cf3f7b5df432def70a817"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"qA2bZzb+l8b3Vi006TGwYBm+9q7Cf1rTktKCArKCrFiJ+ooU+Z7Q6sN6ExifPxgaXO1Czoj8wrWP9ZOBzZXEDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T15:27:13.120799Z","bundle_sha256":"f08837c1f818d5b2bf2dd8ed953a24ec733b9961574a5efba082427877d15fb5"}}