{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2017:IG7ZEQJHMXLKWYIST6PO54W73S","short_pith_number":"pith:IG7ZEQJH","canonical_record":{"source":{"id":"1709.03376","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-11T13:44:30Z","cross_cats_sorted":[],"title_canon_sha256":"028bbefee9299aef319c093be61424b8b15c487e44eb59a212337ba9ef323e04","abstract_canon_sha256":"7b9bccce22986560e232e1bf05137d9ed44dc8e7f0e4ce29e7e20c3e6cbe06a0"},"schema_version":"1.0"},"canonical_sha256":"41bf92412765d6ab61129f9eeef2dfdcb5f68d51e58c8a38b9c96929c0d60525","source":{"kind":"arxiv","id":"1709.03376","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1709.03376","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"arxiv_version","alias_value":"1709.03376v3","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1709.03376","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"pith_short_12","alias_value":"IG7ZEQJHMXLK","created_at":"2026-05-18T12:31:21Z"},{"alias_kind":"pith_short_16","alias_value":"IG7ZEQJHMXLKWYIS","created_at":"2026-05-18T12:31:21Z"},{"alias_kind":"pith_short_8","alias_value":"IG7ZEQJH","created_at":"2026-05-18T12:31:21Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2017:IG7ZEQJHMXLKWYIST6PO54W73S","target":"record","payload":{"canonical_record":{"source":{"id":"1709.03376","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-11T13:44:30Z","cross_cats_sorted":[],"title_canon_sha256":"028bbefee9299aef319c093be61424b8b15c487e44eb59a212337ba9ef323e04","abstract_canon_sha256":"7b9bccce22986560e232e1bf05137d9ed44dc8e7f0e4ce29e7e20c3e6cbe06a0"},"schema_version":"1.0"},"canonical_sha256":"41bf92412765d6ab61129f9eeef2dfdcb5f68d51e58c8a38b9c96929c0d60525","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:21:04.718733Z","signature_b64":"nh4RL1mOT7eta8C4y1946FdumjUl0h8BT2KKn635lQPpAUWpOk3huXICg1BDL25jsnP88fNKCcEgonMDjGnvAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"41bf92412765d6ab61129f9eeef2dfdcb5f68d51e58c8a38b9c96929c0d60525","last_reissued_at":"2026-05-18T00:21:04.718200Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:21:04.718200Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"1709.03376","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:21:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"D1/6Y/F/kN4fnUT48reeJO+cWg+id3QUfYjBUK7+Xq1SJGiJOFFujNQ/qPAfmO2Wo0ddhMiLWgSJ1dSwqdd1BQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T05:08:45.105970Z"},"content_sha256":"fce6c8ae539d23793ab2ef078b170e6a1cfe335777bb620ed49c55183fb80011","schema_version":"1.0","event_id":"sha256:fce6c8ae539d23793ab2ef078b170e6a1cfe335777bb620ed49c55183fb80011"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2017:IG7ZEQJHMXLKWYIST6PO54W73S","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Stack-Captioning: Coarse-to-Fine Learning for Image Captioning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Gang Wang, Jianfei Cai, Jiuxiang Gu, Tsuhan Chen","submitted_at":"2017-09-11T13:44:30Z","abstract_excerpt":"The existing image captioning approaches typically train a one-stage sentence decoder, which is difficult to generate rich fine-grained descriptions. On the other hand, multi-stage image caption model is hard to train due to the vanishing gradient problem. In this paper, we propose a coarse-to-fine multi-stage prediction framework for image captioning, composed of multiple decoders each of which operates on the output of the previous stage, producing increasingly refined image descriptions. Our proposed learning approach addresses the difficulty of vanishing gradients during training by provid"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1709.03376","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T00:21:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OEIChTz/Jwn9lBH2clf5gQOj2OaD9LRA9RpqtVuoMZ9f64rXdPTRdX17FK6LMa7qVfii8/Ok/HDNpLSyoS2DAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T05:08:45.106703Z"},"content_sha256":"83dffe51519f29fbcbd54d6804a4132f935ae00c09ceec233b2f6a98838eabe1","schema_version":"1.0","event_id":"sha256:83dffe51519f29fbcbd54d6804a4132f935ae00c09ceec233b2f6a98838eabe1"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/IG7ZEQJHMXLKWYIST6PO54W73S/bundle.json","state_url":"https://pith.science/pith/IG7ZEQJHMXLKWYIST6PO54W73S/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/IG7ZEQJHMXLKWYIST6PO54W73S/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T05:08:45Z","links":{"resolver":"https://pith.science/pith/IG7ZEQJHMXLKWYIST6PO54W73S","bundle":"https://pith.science/pith/IG7ZEQJHMXLKWYIST6PO54W73S/bundle.json","state":"https://pith.science/pith/IG7ZEQJHMXLKWYIST6PO54W73S/state.json","well_known_bundle":"https://pith.science/.well-known/pith/IG7ZEQJHMXLKWYIST6PO54W73S/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2017:IG7ZEQJHMXLKWYIST6PO54W73S","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7b9bccce22986560e232e1bf05137d9ed44dc8e7f0e4ce29e7e20c3e6cbe06a0","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-11T13:44:30Z","title_canon_sha256":"028bbefee9299aef319c093be61424b8b15c487e44eb59a212337ba9ef323e04"},"schema_version":"1.0","source":{"id":"1709.03376","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"1709.03376","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"arxiv_version","alias_value":"1709.03376v3","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1709.03376","created_at":"2026-05-18T00:21:04Z"},{"alias_kind":"pith_short_12","alias_value":"IG7ZEQJHMXLK","created_at":"2026-05-18T12:31:21Z"},{"alias_kind":"pith_short_16","alias_value":"IG7ZEQJHMXLKWYIS","created_at":"2026-05-18T12:31:21Z"},{"alias_kind":"pith_short_8","alias_value":"IG7ZEQJH","created_at":"2026-05-18T12:31:21Z"}],"graph_snapshots":[{"event_id":"sha256:83dffe51519f29fbcbd54d6804a4132f935ae00c09ceec233b2f6a98838eabe1","target":"graph","created_at":"2026-05-18T00:21:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"The existing image captioning approaches typically train a one-stage sentence decoder, which is difficult to generate rich fine-grained descriptions. On the other hand, multi-stage image caption model is hard to train due to the vanishing gradient problem. In this paper, we propose a coarse-to-fine multi-stage prediction framework for image captioning, composed of multiple decoders each of which operates on the output of the previous stage, producing increasingly refined image descriptions. Our proposed learning approach addresses the difficulty of vanishing gradients during training by provid","authors_text":"Gang Wang, Jianfei Cai, Jiuxiang Gu, Tsuhan Chen","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-11T13:44:30Z","title":"Stack-Captioning: Coarse-to-Fine Learning for Image Captioning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1709.03376","kind":"arxiv","version":3},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:fce6c8ae539d23793ab2ef078b170e6a1cfe335777bb620ed49c55183fb80011","target":"record","created_at":"2026-05-18T00:21:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7b9bccce22986560e232e1bf05137d9ed44dc8e7f0e4ce29e7e20c3e6cbe06a0","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2017-09-11T13:44:30Z","title_canon_sha256":"028bbefee9299aef319c093be61424b8b15c487e44eb59a212337ba9ef323e04"},"schema_version":"1.0","source":{"id":"1709.03376","kind":"arxiv","version":3}},"canonical_sha256":"41bf92412765d6ab61129f9eeef2dfdcb5f68d51e58c8a38b9c96929c0d60525","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"41bf92412765d6ab61129f9eeef2dfdcb5f68d51e58c8a38b9c96929c0d60525","first_computed_at":"2026-05-18T00:21:04.718200Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T00:21:04.718200Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nh4RL1mOT7eta8C4y1946FdumjUl0h8BT2KKn635lQPpAUWpOk3huXICg1BDL25jsnP88fNKCcEgonMDjGnvAg==","signature_status":"signed_v1","signed_at":"2026-05-18T00:21:04.718733Z","signed_message":"canonical_sha256_bytes"},"source_id":"1709.03376","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:fce6c8ae539d23793ab2ef078b170e6a1cfe335777bb620ed49c55183fb80011","sha256:83dffe51519f29fbcbd54d6804a4132f935ae00c09ceec233b2f6a98838eabe1"],"state_sha256":"fb25c3b33af94af192037beaabe326c078c183852e39ddec9b8431ef718907ac"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"+er4/TZudXh30uZYZdQ+MLLMY03l4SK9bP+8dP5hYRzc6x+rbHus5IAH/IPTYFEnioclj3vpS5CUpNGwAK3eBA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T05:08:45.110467Z","bundle_sha256":"725ff4a031e0f1c15856fc28d49e18529e496c503d6bb56353f984bcca3da393"}}