{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2015:4OAQATPCTQRO3ZFHW5OFMLKXDU","short_pith_number":"pith:4OAQATPC","schema_version":"1.0","canonical_sha256":"e381004de29c22ede4a7b75c562d571d1b08b5076cbefbe7ff41b6f45f346265","source":{"kind":"arxiv","id":"1505.04474","version":1},"attestation_state":"computed","paper":{"title":"Visual Semantic Role Labeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Jitendra Malik, Saurabh Gupta","submitted_at":"2015-05-17T23:21:35Z","abstract_excerpt":"In this paper we introduce the problem of Visual Semantic Role Labeling: given an image we want to detect people doing actions and localize the objects of interaction. Classical approaches to action recognition either study the task of action classification at the image or video clip level or at best produce a bounding box around the person doing the action. We believe such an output is inadequate and a complete understanding can only come when we are able to associate objects in the scene to the different semantic roles of the action. To enable progress towards this goal, we annotate a datase"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1505.04474","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2015-05-17T23:21:35Z","cross_cats_sorted":[],"title_canon_sha256":"c31c724c604ce1198ed8d3bed438bfb1565f59efd6fdd8e533bb2bc7720adf2f","abstract_canon_sha256":"5f6daf2d7d17d13eb712929eb9284080f0769960b4d978f0e65ca3a3771dea60"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:07:25.356047Z","signature_b64":"+U+bGXxD7WmuiB+P5BkmD+EqfQaJN34Efyypax/PbQmInjAV6rI2YXoJvPKE+eO4kUN1oxkQtV3B6kNgUGklCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e381004de29c22ede4a7b75c562d571d1b08b5076cbefbe7ff41b6f45f346265","last_reissued_at":"2026-05-18T02:07:25.355632Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:07:25.355632Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Visual Semantic Role Labeling","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Jitendra Malik, Saurabh Gupta","submitted_at":"2015-05-17T23:21:35Z","abstract_excerpt":"In this paper we introduce the problem of Visual Semantic Role Labeling: given an image we want to detect people doing actions and localize the objects of interaction. Classical approaches to action recognition either study the task of action classification at the image or video clip level or at best produce a bounding box around the person doing the action. We believe such an output is inadequate and a complete understanding can only come when we are able to associate objects in the scene to the different semantic roles of the action. To enable progress towards this goal, we annotate a datase"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1505.04474","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1505.04474","created_at":"2026-05-18T02:07:25.355693+00:00"},{"alias_kind":"arxiv_version","alias_value":"1505.04474v1","created_at":"2026-05-18T02:07:25.355693+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1505.04474","created_at":"2026-05-18T02:07:25.355693+00:00"},{"alias_kind":"pith_short_12","alias_value":"4OAQATPCTQRO","created_at":"2026-05-18T12:29:05.191682+00:00"},{"alias_kind":"pith_short_16","alias_value":"4OAQATPCTQRO3ZFH","created_at":"2026-05-18T12:29:05.191682+00:00"},{"alias_kind":"pith_short_8","alias_value":"4OAQATPC","created_at":"2026-05-18T12:29:05.191682+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2502.08660","citing_title":"A Systematic Survey of Semantic Role Labeling in the Era of Pretrained Language Models","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"1908.03557","citing_title":"VisualBERT: A Simple and Performant Baseline for Vision and Language","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05057","citing_title":"ScriptHOI: Learning Scripted State Transitions for Open-Vocabulary Human-Object Interaction Detection","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05057","citing_title":"ScriptHOI: Learning Scripted State Transitions for Open-Vocabulary Human-Object Interaction Detection","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.13448","citing_title":"A Study of Failure Modes in Two-Stage Human-Object Interaction Detection","ref_index":5,"is_internal_anchor":false},{"citing_arxiv_id":"2604.18623","citing_title":"Can We Build Scene Graphs, Not Classify Them? FlowSG: Progressive Image-Conditioned Scene Graph Generation with Flow Matching","ref_index":14,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU","json":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU.json","graph_json":"https://pith.science/api/pith-number/4OAQATPCTQRO3ZFHW5OFMLKXDU/graph.json","events_json":"https://pith.science/api/pith-number/4OAQATPCTQRO3ZFHW5OFMLKXDU/events.json","paper":"https://pith.science/paper/4OAQATPC"},"agent_actions":{"view_html":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU","download_json":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU.json","view_paper":"https://pith.science/paper/4OAQATPC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1505.04474&json=true","fetch_graph":"https://pith.science/api/pith-number/4OAQATPCTQRO3ZFHW5OFMLKXDU/graph.json","fetch_events":"https://pith.science/api/pith-number/4OAQATPCTQRO3ZFHW5OFMLKXDU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU/action/storage_attestation","attest_author":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU/action/author_attestation","sign_citation":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU/action/citation_signature","submit_replication":"https://pith.science/pith/4OAQATPCTQRO3ZFHW5OFMLKXDU/action/replication_record"}},"created_at":"2026-05-18T02:07:25.355693+00:00","updated_at":"2026-05-18T02:07:25.355693+00:00"}