{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:O4B2NNVEKGX7O56DOKJNSSWVV3","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf"},"schema_version":"1.0","source":{"id":"2605.24020","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"arxiv_version","alias_value":"2605.24020v1","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_12","alias_value":"O4B2NNVEKGX7","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_16","alias_value":"O4B2NNVEKGX7O56D","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_8","alias_value":"O4B2NNVE","created_at":"2026-05-26T01:02:41Z"}],"graph_snapshots":[{"event_id":"sha256:d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a","target":"graph","created_at":"2026-05-26T01:02:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.24020/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Advancements at the intersection of computer vision and natural language processing are crucial for applications like assistive tech, multimedia querying, and robotics. This dissertation proposes novel architectures to improve intelligent agents across three key vision-language tasks: image captioning, visual dialog, and interactive instruction following.\n  First, we address limitations in visual representation for image captioning. Traditional models rely on region-based features from CNN detectors, which lack global context and suffer from high computational overhead. We propose GRIT (Grid a","authors_text":"Van Quang Nguyen","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title":"Machine Intelligence that Understands Visual and Linguistic Information and Interacts with Humans and Environments"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24020","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070","target":"record","created_at":"2026-05-26T01:02:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf"},"schema_version":"1.0","source":{"id":"2605.24020","kind":"arxiv","version":1}},"canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","first_computed_at":"2026-05-26T01:02:41.434699Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T01:02:41.434699Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"7VRgLwMTDVLhXEtQ2efra158i044bnEnObrLhOUj0y7MMGYPal2BA+TRyEb9na4bl+Dga/aoQOuLqr1INwhpBw==","signature_status":"signed_v1","signed_at":"2026-05-26T01:02:41.436284Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.24020","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070","sha256:d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a"],"state_sha256":"268f2797c0bb941d7b8addd5527256a16fc1ef00d08eb612e89b6cd3d9906b90"}