{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:SUUZZ533FBXSSFHLJZGXK2NXDB","short_pith_number":"pith:SUUZZ533","canonical_record":{"source":{"id":"2603.18003","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T17:59:12Z","cross_cats_sorted":[],"title_canon_sha256":"6ee668eae94df421a3bb42bb3de036911ca2050d51e070adadb41b5de605df38","abstract_canon_sha256":"3b9be5fcea6414be906fca4d985389f559572a957794e3559b6b5a5d406063cd"},"schema_version":"1.0"},"canonical_sha256":"95299cf77b286f2914eb4e4d7569b71843ff85f1a54a34495bfb0d9bd31f3677","source":{"kind":"arxiv","id":"2603.18003","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.18003","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"arxiv_version","alias_value":"2603.18003v4","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.18003","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_12","alias_value":"SUUZZ533FBXS","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_16","alias_value":"SUUZZ533FBXSSFHL","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_8","alias_value":"SUUZZ533","created_at":"2026-05-20T01:05:11Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:SUUZZ533FBXSSFHLJZGXK2NXDB","target":"record","payload":{"canonical_record":{"source":{"id":"2603.18003","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T17:59:12Z","cross_cats_sorted":[],"title_canon_sha256":"6ee668eae94df421a3bb42bb3de036911ca2050d51e070adadb41b5de605df38","abstract_canon_sha256":"3b9be5fcea6414be906fca4d985389f559572a957794e3559b6b5a5d406063cd"},"schema_version":"1.0"},"canonical_sha256":"95299cf77b286f2914eb4e4d7569b71843ff85f1a54a34495bfb0d9bd31f3677","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:11.462074Z","signature_b64":"ep83adeNiGXgBC+rkdvIJcE/dCHuNvnTjaMLv9alsIjRt+vYgOqgiM3QfAIvyb+heq7CgORsCid3Ta1PBP2/Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"95299cf77b286f2914eb4e4d7569b71843ff85f1a54a34495bfb0d9bd31f3677","last_reissued_at":"2026-05-20T01:05:11.461264Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:11.461264Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.18003","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"36ejfsf2aBPlTgL8sYRvOHeTJ/0WnI/SNxfy6Hd18nFSLC24BRCCFHVw8vs19fiz18sgGeLv3VViZH53bI7UBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T05:21:17.488352Z"},"content_sha256":"8c344b66da47647efc573b28a4df8b71f84cc5523604b369dc0be5334f172696","schema_version":"1.0","event_id":"sha256:8c344b66da47647efc573b28a4df8b71f84cc5523604b369dc0be5334f172696"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:SUUZZ533FBXSSFHLJZGXK2NXDB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Universal Skeleton Understanding via Differentiable Rendering and MLLMs","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Kai-Kuang Ma, Mengyuan Liu, Peiming Li, Xinshun Wang, Yang Tang, Ziyi Wang","submitted_at":"2026-03-18T17:59:12Z","abstract_excerpt":"Multimodal large language models (MLLMs) exhibit strong visual-language reasoning, yet cannot process structured, non-visual data such as human skeletons. Existing methods either compress skeleton dynamics into lossy feature vectors for text alignment, or quantize motion into discrete tokens that generalize poorly across heterogeneous skeleton formats. We present SkeletonLLM, which achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality. At its core is DrAction, a differentiable, format-agnostic renderer that converts skeleta"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"SkeletonLLM achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality via DrAction, with cooperative training enabling strong generalization in open-vocabulary action recognition and extension to motion captioning and QA across heterogeneous formats.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That converting skeleton kinematics into compact image sequences via differentiable rendering preserves all task-relevant information without significant loss and that MLLM gradients can meaningfully guide the renderer to produce informative visual tokens.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SkeletonLLM translates arbitrary skeleton sequences into visual image sequences via a differentiable renderer DrAction, allowing MLLMs to perform open-vocabulary action recognition, captioning, and QA across heterogeneous skeleton formats.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a1a7287c349cf18b177115654018d41d57dd64a805249d04224b45eb1cef5d75"},"source":{"id":"2603.18003","kind":"arxiv","version":4},"verdict":{"id":"9db5273f-a787-4710-8c71-db0077b2e6a5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T09:21:07.795431Z","strongest_claim":"SkeletonLLM achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality via DrAction, with cooperative training enabling strong generalization in open-vocabulary action recognition and extension to motion captioning and QA across heterogeneous formats.","one_line_summary":"SkeletonLLM translates arbitrary skeleton sequences into visual image sequences via a differentiable renderer DrAction, allowing MLLMs to perform open-vocabulary action recognition, captioning, and QA across heterogeneous skeleton formats.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That converting skeleton kinematics into compact image sequences via differentiable rendering preserves all task-relevant information without significant loss and that MLLM gradients can meaningfully guide the renderer to produce informative visual tokens.","pith_extraction_headline":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.18003/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"89a3dfebba08759828aa285acdfef94ed96da6269c1ec6ec76ef8377fbe28e7c"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9db5273f-a787-4710-8c71-db0077b2e6a5"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YN1TowwhEoAX1fnlj3qwnx7ZQK8SRiEyxeVZmlgDfwHprMM2JoXhNXQwcKYgfFIazfvn9mT2DAQBqJet/2xhBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T05:21:17.489252Z"},"content_sha256":"9b489ee2e9d64d65cb88ad213210f41b753b0651a96378a7061f2d36b1035783","schema_version":"1.0","event_id":"sha256:9b489ee2e9d64d65cb88ad213210f41b753b0651a96378a7061f2d36b1035783"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/bundle.json","state_url":"https://pith.science/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T05:21:17Z","links":{"resolver":"https://pith.science/pith/SUUZZ533FBXSSFHLJZGXK2NXDB","bundle":"https://pith.science/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/bundle.json","state":"https://pith.science/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/SUUZZ533FBXSSFHLJZGXK2NXDB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:SUUZZ533FBXSSFHLJZGXK2NXDB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3b9be5fcea6414be906fca4d985389f559572a957794e3559b6b5a5d406063cd","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T17:59:12Z","title_canon_sha256":"6ee668eae94df421a3bb42bb3de036911ca2050d51e070adadb41b5de605df38"},"schema_version":"1.0","source":{"id":"2603.18003","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.18003","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"arxiv_version","alias_value":"2603.18003v4","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.18003","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_12","alias_value":"SUUZZ533FBXS","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_16","alias_value":"SUUZZ533FBXSSFHL","created_at":"2026-05-20T01:05:11Z"},{"alias_kind":"pith_short_8","alias_value":"SUUZZ533","created_at":"2026-05-20T01:05:11Z"}],"graph_snapshots":[{"event_id":"sha256:9b489ee2e9d64d65cb88ad213210f41b753b0651a96378a7061f2d36b1035783","target":"graph","created_at":"2026-05-20T01:05:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"SkeletonLLM achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality via DrAction, with cooperative training enabling strong generalization in open-vocabulary action recognition and extension to motion captioning and QA across heterogeneous formats."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That converting skeleton kinematics into compact image sequences via differentiable rendering preserves all task-relevant information without significant loss and that MLLM gradients can meaningfully guide the renderer to produce informative visual tokens."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SkeletonLLM translates arbitrary skeleton sequences into visual image sequences via a differentiable renderer DrAction, allowing MLLMs to perform open-vocabulary action recognition, captioning, and QA across heterogeneous skeleton formats."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly."}],"snapshot_sha256":"a1a7287c349cf18b177115654018d41d57dd64a805249d04224b45eb1cef5d75"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"89a3dfebba08759828aa285acdfef94ed96da6269c1ec6ec76ef8377fbe28e7c"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.18003/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Multimodal large language models (MLLMs) exhibit strong visual-language reasoning, yet cannot process structured, non-visual data such as human skeletons. Existing methods either compress skeleton dynamics into lossy feature vectors for text alignment, or quantize motion into discrete tokens that generalize poorly across heterogeneous skeleton formats. We present SkeletonLLM, which achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality. At its core is DrAction, a differentiable, format-agnostic renderer that converts skeleta","authors_text":"Kai-Kuang Ma, Mengyuan Liu, Peiming Li, Xinshun Wang, Yang Tang, Ziyi Wang","cross_cats":[],"headline":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T17:59:12Z","title":"Universal Skeleton Understanding via Differentiable Rendering and MLLMs"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.18003","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T09:21:07.795431Z","id":"9db5273f-a787-4710-8c71-db0077b2e6a5","model_set":{"reader":"grok-4.3"},"one_line_summary":"SkeletonLLM translates arbitrary skeleton sequences into visual image sequences via a differentiable renderer DrAction, allowing MLLMs to perform open-vocabulary action recognition, captioning, and QA across heterogeneous skeleton formats.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Differentiable rendering converts arbitrary skeleton sequences into images that MLLMs can process directly.","strongest_claim":"SkeletonLLM achieves universal skeleton understanding by translating arbitrary skeleton sequences into the MLLM's native visual modality via DrAction, with cooperative training enabling strong generalization in open-vocabulary action recognition and extension to motion captioning and QA across heterogeneous formats.","weakest_assumption":"That converting skeleton kinematics into compact image sequences via differentiable rendering preserves all task-relevant information without significant loss and that MLLM gradients can meaningfully guide the renderer to produce informative visual tokens."}},"verdict_id":"9db5273f-a787-4710-8c71-db0077b2e6a5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:8c344b66da47647efc573b28a4df8b71f84cc5523604b369dc0be5334f172696","target":"record","created_at":"2026-05-20T01:05:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3b9be5fcea6414be906fca4d985389f559572a957794e3559b6b5a5d406063cd","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2026-03-18T17:59:12Z","title_canon_sha256":"6ee668eae94df421a3bb42bb3de036911ca2050d51e070adadb41b5de605df38"},"schema_version":"1.0","source":{"id":"2603.18003","kind":"arxiv","version":4}},"canonical_sha256":"95299cf77b286f2914eb4e4d7569b71843ff85f1a54a34495bfb0d9bd31f3677","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"95299cf77b286f2914eb4e4d7569b71843ff85f1a54a34495bfb0d9bd31f3677","first_computed_at":"2026-05-20T01:05:11.461264Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:11.461264Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ep83adeNiGXgBC+rkdvIJcE/dCHuNvnTjaMLv9alsIjRt+vYgOqgiM3QfAIvyb+heq7CgORsCid3Ta1PBP2/Ag==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:11.462074Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.18003","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:8c344b66da47647efc573b28a4df8b71f84cc5523604b369dc0be5334f172696","sha256:9b489ee2e9d64d65cb88ad213210f41b753b0651a96378a7061f2d36b1035783"],"state_sha256":"4d3208ba5b12c780e2e9481826a3a3d6d1ea8a3d5a423b612021f186dec0c8c2"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Q9BaXmkdR1WU+2/Cu+9A0IoSNSSU6mNpOpvEvs6D95hB9dBCnI1vV+MBN8xqvzb7y+DZEmKd2u0yXH9Zc5JgCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T05:21:17.493040Z","bundle_sha256":"af93cca52be40b038e3dcc5fdb7cdb544464aa06bea12369ea96f685ea7abf04"}}