{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:NQB6XJHGWOB4IBL3CW7Q6C7TAX","short_pith_number":"pith:NQB6XJHG","canonical_record":{"source":{"id":"2507.16815","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46Z","cross_cats_sorted":["cs.AI","cs.LG","cs.RO"],"title_canon_sha256":"e1e2597472783ffaa4622feef54983ae735b3d47f340f4c8f51fa4811149b9cf","abstract_canon_sha256":"3e17ac51568322d07c2fafa17ded32be7cc4cf30c9accd85b43b4c03f6972f83"},"schema_version":"1.0"},"canonical_sha256":"6c03eba4e6b383c4057b15bf0f0bf305c3694dd8a4e9d084b72b5ea4f57a001c","source":{"kind":"arxiv","id":"2507.16815","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.16815","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2507.16815v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.16815","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"NQB6XJHGWOB4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NQB6XJHGWOB4IBL3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NQB6XJHG","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:NQB6XJHGWOB4IBL3CW7Q6C7TAX","target":"record","payload":{"canonical_record":{"source":{"id":"2507.16815","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46Z","cross_cats_sorted":["cs.AI","cs.LG","cs.RO"],"title_canon_sha256":"e1e2597472783ffaa4622feef54983ae735b3d47f340f4c8f51fa4811149b9cf","abstract_canon_sha256":"3e17ac51568322d07c2fafa17ded32be7cc4cf30c9accd85b43b4c03f6972f83"},"schema_version":"1.0"},"canonical_sha256":"6c03eba4e6b383c4057b15bf0f0bf305c3694dd8a4e9d084b72b5ea4f57a001c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.219016Z","signature_b64":"/1E9WA5ruyvqPGhZ/ntTpfydVyKe9Oa1FvZ0l+y050b6VWMbHWNFbjhfIVv1hcX5RCgQJnpQcmutbujOQOoWAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6c03eba4e6b383c4057b15bf0f0bf305c3694dd8a4e9d084b72b5ea4f57a001c","last_reissued_at":"2026-05-17T23:38:49.218542Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.218542Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2507.16815","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ien6l4rTahnGz/lauaVRHRxMtocXZgKOujChWRRkAtxIPicojNDuJsCdwaA/OLeNpKqROmiMeymhb6X6kH28Dw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T19:10:57.947256Z"},"content_sha256":"83fa357a8827829396957976da68233f70986cfe6776173e8ada7e1b8cfcf35e","schema_version":"1.0","event_id":"sha256:83fa357a8827829396957976da68233f70986cfe6776173e8ada7e1b8cfcf35e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:NQB6XJHGWOB4IBL3CW7Q6C7TAX","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.LG","cs.RO"],"primary_cat":"cs.CV","authors_text":"Chi-Pin Huang, Fu-En Yang, Min-Hung Chen, Yu-Chiang Frank Wang, Yueh-Hua Wu","submitted_at":"2025-07-22T17:59:46Z","abstract_excerpt":"Vision-language-action (VLA) reasoning tasks require agents to interpret multimodal instructions, perform long-horizon planning, and act adaptively in dynamic environments. Existing approaches typically train VLA models in an end-to-end fashion, directly mapping inputs to actions without explicit reasoning, which hinders their ability to plan over multiple steps or adapt to complex task variations. In this paper, we propose ThinkAct, a dual-system framework that bridges high-level reasoning with low-level action execution via reinforced visual latent planning. ThinkAct trains a multimodal LLM "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2507.16815","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"f1oinjv/Uu1ClzhJ6YzbXYmTroIl2+JxHGUP+4CC8XOM4X20dqRkAbXYlC5Uxbp/QxBpZt5AOMUr2b0uW665DQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T19:10:57.947613Z"},"content_sha256":"d738994744bb37be0f2f95764a00910d59b4ed541a1c3e41959770d5101e7fd4","schema_version":"1.0","event_id":"sha256:d738994744bb37be0f2f95764a00910d59b4ed541a1c3e41959770d5101e7fd4"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/bundle.json","state_url":"https://pith.science/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-03T19:10:57Z","links":{"resolver":"https://pith.science/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX","bundle":"https://pith.science/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/bundle.json","state":"https://pith.science/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NQB6XJHGWOB4IBL3CW7Q6C7TAX/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:NQB6XJHGWOB4IBL3CW7Q6C7TAX","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3e17ac51568322d07c2fafa17ded32be7cc4cf30c9accd85b43b4c03f6972f83","cross_cats_sorted":["cs.AI","cs.LG","cs.RO"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46Z","title_canon_sha256":"e1e2597472783ffaa4622feef54983ae735b3d47f340f4c8f51fa4811149b9cf"},"schema_version":"1.0","source":{"id":"2507.16815","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2507.16815","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2507.16815v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.16815","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"NQB6XJHGWOB4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NQB6XJHGWOB4IBL3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NQB6XJHG","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:d738994744bb37be0f2f95764a00910d59b4ed541a1c3e41959770d5101e7fd4","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Vision-language-action (VLA) reasoning tasks require agents to interpret multimodal instructions, perform long-horizon planning, and act adaptively in dynamic environments. Existing approaches typically train VLA models in an end-to-end fashion, directly mapping inputs to actions without explicit reasoning, which hinders their ability to plan over multiple steps or adapt to complex task variations. In this paper, we propose ThinkAct, a dual-system framework that bridges high-level reasoning with low-level action execution via reinforced visual latent planning. ThinkAct trains a multimodal LLM ","authors_text":"Chi-Pin Huang, Fu-En Yang, Min-Hung Chen, Yu-Chiang Frank Wang, Yueh-Hua Wu","cross_cats":["cs.AI","cs.LG","cs.RO"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46Z","title":"ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2507.16815","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:83fa357a8827829396957976da68233f70986cfe6776173e8ada7e1b8cfcf35e","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3e17ac51568322d07c2fafa17ded32be7cc4cf30c9accd85b43b4c03f6972f83","cross_cats_sorted":["cs.AI","cs.LG","cs.RO"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-07-22T17:59:46Z","title_canon_sha256":"e1e2597472783ffaa4622feef54983ae735b3d47f340f4c8f51fa4811149b9cf"},"schema_version":"1.0","source":{"id":"2507.16815","kind":"arxiv","version":2}},"canonical_sha256":"6c03eba4e6b383c4057b15bf0f0bf305c3694dd8a4e9d084b72b5ea4f57a001c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6c03eba4e6b383c4057b15bf0f0bf305c3694dd8a4e9d084b72b5ea4f57a001c","first_computed_at":"2026-05-17T23:38:49.218542Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.218542Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"/1E9WA5ruyvqPGhZ/ntTpfydVyKe9Oa1FvZ0l+y050b6VWMbHWNFbjhfIVv1hcX5RCgQJnpQcmutbujOQOoWAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.219016Z","signed_message":"canonical_sha256_bytes"},"source_id":"2507.16815","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:83fa357a8827829396957976da68233f70986cfe6776173e8ada7e1b8cfcf35e","sha256:d738994744bb37be0f2f95764a00910d59b4ed541a1c3e41959770d5101e7fd4"],"state_sha256":"74d4ff213f91b8b4dd75526920ceefabc4d9410c93e5b214c2eb31a5be60a363"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"izaK7pxE/dss/kbUYvf0mWFKWQUFWviLvf2dnsBW5CpZ65D7dpU6RCcudthRLz8atZAAPjhvr01rR70xsHxuDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-03T19:10:57.949597Z","bundle_sha256":"24ff9c5eb9c3028bd463a237dc42955a61030fec1dd74002b56d1edbfe131bed"}}