{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:E3OGLHY2ACDJJNDR2QAXNUJ3AZ","short_pith_number":"pith:E3OGLHY2","canonical_record":{"source":{"id":"2504.21561","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-04-30T12:01:27Z","cross_cats_sorted":[],"title_canon_sha256":"0c9e93d6454d4340f5f5fb6769d36f24715fd7490e0c25c117f6768e4bd77006","abstract_canon_sha256":"ffa69b38435c0f8f4898f9c2c4f1cf5f9e43baa17ed1eccd922cb580ff2637c5"},"schema_version":"1.0"},"canonical_sha256":"26dc659f1a008694b471d40176d13b0645e495b809ee7ceb3159c36490e8f561","source":{"kind":"arxiv","id":"2504.21561","version":5},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.21561","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"arxiv_version","alias_value":"2504.21561v5","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.21561","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_12","alias_value":"E3OGLHY2ACDJ","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_16","alias_value":"E3OGLHY2ACDJJNDR","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_8","alias_value":"E3OGLHY2","created_at":"2026-06-12T01:09:08Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:E3OGLHY2ACDJJNDR2QAXNUJ3AZ","target":"record","payload":{"canonical_record":{"source":{"id":"2504.21561","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-04-30T12:01:27Z","cross_cats_sorted":[],"title_canon_sha256":"0c9e93d6454d4340f5f5fb6769d36f24715fd7490e0c25c117f6768e4bd77006","abstract_canon_sha256":"ffa69b38435c0f8f4898f9c2c4f1cf5f9e43baa17ed1eccd922cb580ff2637c5"},"schema_version":"1.0"},"canonical_sha256":"26dc659f1a008694b471d40176d13b0645e495b809ee7ceb3159c36490e8f561","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-12T01:09:08.562199Z","signature_b64":"J5SuSQMmAGRoXbMa2ifW9Mu3GxiGGyFaBguCkrybmV57HM9lWBtjRVFZ1bS5N+YG+c8qB0HZtdii4/M5Uv/DAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"26dc659f1a008694b471d40176d13b0645e495b809ee7ceb3159c36490e8f561","last_reissued_at":"2026-06-12T01:09:08.561114Z","signature_status":"signed_v1","first_computed_at":"2026-06-12T01:09:08.561114Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2504.21561","source_version":5,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-12T01:09:08Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xStOAaWHI7fLPy17sA4HGGUJtTZ5YfrwZB9Eo4nf/cdhpCpCuTa538ZcNMuclMzeHUMhTEDmNrjTokgo2N5TAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-27T21:15:56.082117Z"},"content_sha256":"dd248e5f319177135f0ffafb3fe7c76c330a76656fa57c395f943a40abbb2bbe","schema_version":"1.0","event_id":"sha256:dd248e5f319177135f0ffafb3fe7c76c330a76656fa57c395f943a40abbb2bbe"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:E3OGLHY2ACDJJNDR2QAXNUJ3AZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Iterative Tool Usage Exploration for Multimodal Agents via Step-wise Preference Tuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bofei Zhang, Chenrui Shi, Pengxiang Li, Qing Li, Song-Chun Zhu, Tao Yuan, Xiaojian Ma, Yapeng Mi, Yunde Jia, Yuwei Wu, Zhi Gao","submitted_at":"2025-04-30T12:01:27Z","abstract_excerpt":"Multimodal agents, which integrate a controller e.g., a vision language model) with external tools, have demonstrated remarkable capabilities in tackling complex multimodal tasks. Existing approaches for training these agents, both supervised fine-tuning and reinforcement learning, depend on extensive human-annotated task-answer pairs and tool trajectories. However, for complex multimodal tasks, such annotations are prohibitively expensive or impractical to obtain. In this paper, we propose an iterative tool usage exploration method for multimodal agents without any pre-collected data, namely "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.21561","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2504.21561/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-12T01:09:08Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J6RZ+gItyQLVD5to9XnDmJFJ34Xi7gZmQHKb4+rC1ApSc5kGybuirilW6unbkwZe9yQTlP8hIrEQ/D24kdIaBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-27T21:15:56.082503Z"},"content_sha256":"83694afd0517cbe93c4f69f6a1f013f85a07a237d592349afb42b6b5ff53654b","schema_version":"1.0","event_id":"sha256:83694afd0517cbe93c4f69f6a1f013f85a07a237d592349afb42b6b5ff53654b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/bundle.json","state_url":"https://pith.science/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-27T21:15:56Z","links":{"resolver":"https://pith.science/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ","bundle":"https://pith.science/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/bundle.json","state":"https://pith.science/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/E3OGLHY2ACDJJNDR2QAXNUJ3AZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:E3OGLHY2ACDJJNDR2QAXNUJ3AZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ffa69b38435c0f8f4898f9c2c4f1cf5f9e43baa17ed1eccd922cb580ff2637c5","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-04-30T12:01:27Z","title_canon_sha256":"0c9e93d6454d4340f5f5fb6769d36f24715fd7490e0c25c117f6768e4bd77006"},"schema_version":"1.0","source":{"id":"2504.21561","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2504.21561","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"arxiv_version","alias_value":"2504.21561v5","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.21561","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_12","alias_value":"E3OGLHY2ACDJ","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_16","alias_value":"E3OGLHY2ACDJJNDR","created_at":"2026-06-12T01:09:08Z"},{"alias_kind":"pith_short_8","alias_value":"E3OGLHY2","created_at":"2026-06-12T01:09:08Z"}],"graph_snapshots":[{"event_id":"sha256:83694afd0517cbe93c4f69f6a1f013f85a07a237d592349afb42b6b5ff53654b","target":"graph","created_at":"2026-06-12T01:09:08Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2504.21561/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Multimodal agents, which integrate a controller e.g., a vision language model) with external tools, have demonstrated remarkable capabilities in tackling complex multimodal tasks. Existing approaches for training these agents, both supervised fine-tuning and reinforcement learning, depend on extensive human-annotated task-answer pairs and tool trajectories. However, for complex multimodal tasks, such annotations are prohibitively expensive or impractical to obtain. In this paper, we propose an iterative tool usage exploration method for multimodal agents without any pre-collected data, namely ","authors_text":"Bofei Zhang, Chenrui Shi, Pengxiang Li, Qing Li, Song-Chun Zhu, Tao Yuan, Xiaojian Ma, Yapeng Mi, Yunde Jia, Yuwei Wu, Zhi Gao","cross_cats":[],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-04-30T12:01:27Z","title":"Iterative Tool Usage Exploration for Multimodal Agents via Step-wise Preference Tuning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.21561","kind":"arxiv","version":5},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:dd248e5f319177135f0ffafb3fe7c76c330a76656fa57c395f943a40abbb2bbe","target":"record","created_at":"2026-06-12T01:09:08Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ffa69b38435c0f8f4898f9c2c4f1cf5f9e43baa17ed1eccd922cb580ff2637c5","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-04-30T12:01:27Z","title_canon_sha256":"0c9e93d6454d4340f5f5fb6769d36f24715fd7490e0c25c117f6768e4bd77006"},"schema_version":"1.0","source":{"id":"2504.21561","kind":"arxiv","version":5}},"canonical_sha256":"26dc659f1a008694b471d40176d13b0645e495b809ee7ceb3159c36490e8f561","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"26dc659f1a008694b471d40176d13b0645e495b809ee7ceb3159c36490e8f561","first_computed_at":"2026-06-12T01:09:08.561114Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-12T01:09:08.561114Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"J5SuSQMmAGRoXbMa2ifW9Mu3GxiGGyFaBguCkrybmV57HM9lWBtjRVFZ1bS5N+YG+c8qB0HZtdii4/M5Uv/DAw==","signature_status":"signed_v1","signed_at":"2026-06-12T01:09:08.562199Z","signed_message":"canonical_sha256_bytes"},"source_id":"2504.21561","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:dd248e5f319177135f0ffafb3fe7c76c330a76656fa57c395f943a40abbb2bbe","sha256:83694afd0517cbe93c4f69f6a1f013f85a07a237d592349afb42b6b5ff53654b"],"state_sha256":"06af3f51f99cfba30ebce834ca82ee255fea3a36d209165c078418d2f54f5070"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"mgNb/9z298kocVRgCFWARImOXS3KMyyGFmCawgcj2grNy1P1NSGvT4zR0DcqJBi6ATxlpYeCvemhWyymAWsECA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-27T21:15:56.084485Z","bundle_sha256":"b9a1c05558e534318e7269d7d15881e6f95d91ac8104ec44b3ad63194cdc03c0"}}