{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:O4B2NNVEKGX7O56DOKJNSSWVV3","short_pith_number":"pith:O4B2NNVE","canonical_record":{"source":{"id":"2605.24020","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf","abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b"},"schema_version":"1.0"},"canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","source":{"kind":"arxiv","id":"2605.24020","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"arxiv_version","alias_value":"2605.24020v1","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_12","alias_value":"O4B2NNVEKGX7","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_16","alias_value":"O4B2NNVEKGX7O56D","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_8","alias_value":"O4B2NNVE","created_at":"2026-05-26T01:02:41Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:O4B2NNVEKGX7O56DOKJNSSWVV3","target":"record","payload":{"canonical_record":{"source":{"id":"2605.24020","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf","abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b"},"schema_version":"1.0"},"canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T01:02:41.436284Z","signature_b64":"7VRgLwMTDVLhXEtQ2efra158i044bnEnObrLhOUj0y7MMGYPal2BA+TRyEb9na4bl+Dga/aoQOuLqr1INwhpBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","last_reissued_at":"2026-05-26T01:02:41.434699Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T01:02:41.434699Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.24020","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T01:02:41Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"z+8CpZCQ5UDXwMQjYkvjkjeXJZhRenLtnLS0uJiFZ3NURVW2JI94fRxWFNKdrtKa9VYoiGQku14IwUIrA2iECg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T13:23:53.235685Z"},"content_sha256":"a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070","schema_version":"1.0","event_id":"sha256:a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:O4B2NNVEKGX7O56DOKJNSSWVV3","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Machine Intelligence that Understands Visual and Linguistic Information and Interacts with Humans and Environments","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Van Quang Nguyen","submitted_at":"2026-05-20T06:11:25Z","abstract_excerpt":"Advancements at the intersection of computer vision and natural language processing are crucial for applications like assistive tech, multimedia querying, and robotics. This dissertation proposes novel architectures to improve intelligent agents across three key vision-language tasks: image captioning, visual dialog, and interactive instruction following.\n  First, we address limitations in visual representation for image captioning. Traditional models rely on region-based features from CNN detectors, which lack global context and suffer from high computational overhead. We propose GRIT (Grid a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24020","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.24020/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T01:02:41Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"I3oMBqxYM3FLBTAN850M1tsT+lsikNiDkE98xMlAmrjQne2nFfrpYih/wdqZdrXDZ8JS1Rg136aajAd3aWV6CA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T13:23:53.236072Z"},"content_sha256":"d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a","schema_version":"1.0","event_id":"sha256:d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/bundle.json","state_url":"https://pith.science/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-02T13:23:53Z","links":{"resolver":"https://pith.science/pith/O4B2NNVEKGX7O56DOKJNSSWVV3","bundle":"https://pith.science/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/bundle.json","state":"https://pith.science/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/state.json","well_known_bundle":"https://pith.science/.well-known/pith/O4B2NNVEKGX7O56DOKJNSSWVV3/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:O4B2NNVEKGX7O56DOKJNSSWVV3","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf"},"schema_version":"1.0","source":{"id":"2605.24020","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"arxiv_version","alias_value":"2605.24020v1","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.24020","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_12","alias_value":"O4B2NNVEKGX7","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_16","alias_value":"O4B2NNVEKGX7O56D","created_at":"2026-05-26T01:02:41Z"},{"alias_kind":"pith_short_8","alias_value":"O4B2NNVE","created_at":"2026-05-26T01:02:41Z"}],"graph_snapshots":[{"event_id":"sha256:d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a","target":"graph","created_at":"2026-05-26T01:02:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.24020/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Advancements at the intersection of computer vision and natural language processing are crucial for applications like assistive tech, multimedia querying, and robotics. This dissertation proposes novel architectures to improve intelligent agents across three key vision-language tasks: image captioning, visual dialog, and interactive instruction following.\n  First, we address limitations in visual representation for image captioning. Traditional models rely on region-based features from CNN detectors, which lack global context and suffer from high computational overhead. We propose GRIT (Grid a","authors_text":"Van Quang Nguyen","cross_cats":["cs.AI"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title":"Machine Intelligence that Understands Visual and Linguistic Information and Interacts with Humans and Environments"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.24020","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070","target":"record","created_at":"2026-05-26T01:02:41Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6c887729ba4cb8bdf678ed317158c8b75f19f85e0cda55d2937852c6a26ba72b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T06:11:25Z","title_canon_sha256":"9733d5baaad3da6ae2974915aa342de72037d4996d1de1301ec0aec8f7f498cf"},"schema_version":"1.0","source":{"id":"2605.24020","kind":"arxiv","version":1}},"canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"7703a6b6a451aff777c37292d94ad5aee69c6067a5514f47228ebc9695f8537f","first_computed_at":"2026-05-26T01:02:41.434699Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T01:02:41.434699Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"7VRgLwMTDVLhXEtQ2efra158i044bnEnObrLhOUj0y7MMGYPal2BA+TRyEb9na4bl+Dga/aoQOuLqr1INwhpBw==","signature_status":"signed_v1","signed_at":"2026-05-26T01:02:41.436284Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.24020","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a76d50414717b9e4c661f65dfb7b97e08b662f3d88f7440e0a8c1192c84d8070","sha256:d6b9d3b46ae07e45b02f4ca263c6d462148f6eb9b620b1fbab30e183e7a6a84a"],"state_sha256":"268f2797c0bb941d7b8addd5527256a16fc1ef00d08eb612e89b6cd3d9906b90"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Ak4BPVfxv8fFk08+QMuHyheGGccQ2CsMfBJkFJVyDMCRuR+cTZAwC7S6/n9SVkGANm3Il2948r2MWQuJs9YOCg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-02T13:23:53.238139Z","bundle_sha256":"a79edc3271c0445e7b20b7bc25d600373e1c89469f3b7573741988e708881fc4"}}