H5FDs3comms.h 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
  2. * Copyright by The HDF Group. *
  3. * All rights reserved. *
  4. * *
  5. * This file is part of HDF5. The full HDF5 copyright notice, including *
  6. * terms governing use, modification, and redistribution, is contained in *
  7. * the COPYING file, which can be found at the root of the source code *
  8. * distribution tree, or in https://www.hdfgroup.org/licenses. *
  9. * If you do not have access to either file, you may request a copy from *
  10. * help@hdfgroup.org. *
  11. * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
  12. /*****************************************************************************
  13. * Read-Only S3 Virtual File Driver (VFD)
  14. *
  15. * This is the header for the S3 Communications module
  16. *
  17. * ***NOT A FILE DRIVER***
  18. *
  19. * Purpose:
  20. *
  21. * - Provide structures and functions related to communicating with
  22. * Amazon S3 (Simple Storage Service).
  23. * - Abstract away the REST API (HTTP,
  24. * networked communications) behind a series of uniform function calls.
  25. * - Handle AWS4 authentication, if appropriate.
  26. * - Fail predictably in event of errors.
  27. * - Eventually, support more S3 operations, such as creating, writing to,
  28. * and removing Objects remotely.
  29. *
  30. * translates:
  31. * `read(some_file, bytes_offset, bytes_length, &dest_buffer);`
  32. * to:
  33. * ```
  34. * GET myfile HTTP/1.1
  35. * Host: somewhere.me
  36. * Range: bytes=4096-5115
  37. * ```
  38. * and places received bytes from HTTP response...
  39. * ```
  40. * HTTP/1.1 206 Partial Content
  41. * Content-Range: bytes 4096-5115/63239
  42. *
  43. * <bytes>
  44. * ```
  45. * ...in destination buffer.
  46. *
  47. * TODO: put documentation in a consistent place and point to it from here.
  48. *
  49. * Programmer: Jacob Smith
  50. * 2017-11-30
  51. *
  52. *****************************************************************************/
  53. #include "H5private.h" /* Generic Functions */
  54. #ifdef H5_HAVE_ROS3_VFD
  55. /* Necessary S3 headers */
  56. #include <curl/curl.h>
  57. #include <openssl/evp.h>
  58. #include <openssl/hmac.h>
  59. #include <openssl/sha.h>
  60. /*****************
  61. * PUBLIC MACROS *
  62. *****************/
  /* Hex-encoded SHA-256 digest of the empty string, i.e. hex(sha256sum("")).
   * Spelled as two adjacent 32-character literals; the compiler joins them
   * into the single 64-character string.
   */
  #define EMPTY_SHA256 \
      "e3b0c44298fc1c149afbf4c8996fb924" \
      "27ae41e4649b934ca495991b7852b855"
  /* Buffer size for an ISO 8601 basic-format timestamp, counting the
   * terminating null (16 characters + 1).
   * Example string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'Z')
   */
  #define ISO8601_SIZE 17
  /* Buffer size for an RFC 7231 date string, counting the terminating null
   * (29 characters + 1).
   * Example string: "Fri, 30 Jun 2017 20:41:55 GMT"
   */
  #define RFC7231_SIZE 30
  /*---------------------------------------------------------------------------
   *
   * Macro: ISO8601NOW()
   *
   * Purpose:
   *
   *     Format the broken-down time `now_gm` into `dest` as
   *     "YYYYmmdd'T'HHMMSS'Z'" (without the single quotes),
   *     e.g., "20170630T204155Z".
   *
   *     Thin wrapper around strftime(), which is given ISO8601_SIZE as the
   *     size of `dest` (terminating null included).
   *
   *     The expression evaluates to strftime()'s return value. Checking it
   *     is left to the caller: on success it equals ISO8601_SIZE - 1.
   *
   *---------------------------------------------------------------------------
   */
  #define ISO8601NOW(dest, now_gm) \
      strftime((dest), ISO8601_SIZE, "%Y%m%d" "T" "%H%M%S" "Z", (now_gm))
  /*---------------------------------------------------------------------------
   *
   * Macro: RFC7231NOW()
   *
   * Purpose:
   *
   *     Format the broken-down time `now_gm` into `dest` as
   *     "Day, dd Mmm YYYY HH:MM:SS GMT",
   *     e.g., "Fri, 30 Jun 2017 20:41:55 GMT".
   *
   *     Thin wrapper around strftime(), which is given RFC7231_SIZE as the
   *     size of `dest` (terminating null included).
   *
   *     The day and month names come from strftime()'s %a and %b, which
   *     follow the current locale; the example output above corresponds to
   *     the default "C" locale.
   *
   *     The expression evaluates to strftime()'s return value. Checking it
   *     is left to the caller: on success it equals RFC7231_SIZE - 1.
   *
   *---------------------------------------------------------------------------
   */
  #define RFC7231NOW(dest, now_gm) \
      strftime((dest), RFC7231_SIZE, "%a, %d %b %Y" " %H:%M:%S GMT", (now_gm))
  /* Reasonable upper bound on the size of a credential string; passed as the
   * buffer limit by S3COMMS_FORMAT_CREDENTIAL (below).
   *
   *    17 <- "////aws4_request\0"  (separators, fixed suffix, null)
   *     2 <- "s3"                  (service)
   *     8 <- "YYYYmmdd"            (date)
   *   128 <- (access_id)
   *   ---
   *   155 :: sum
   *
   * NOTE(review): the tally above has no entry for the region string (e.g.
   * "us-east-1"), although the credential format contains one. Confirm that
   * the access_id allowance leaves enough slack before relying on this bound.
   */
  #define S3COMMS_MAX_CREDENTIAL_SIZE 155
  /*---------------------------------------------------------------------------
   *
   * Macro: S3COMMS_FORMAT_CREDENTIAL()
   *
   * Purpose:
   *
   *     Format the AWS4 "S3 Credential" string
   *
   *         "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request"
   *
   *     into `dest`. Wrapper for HDsnprintf(), which is given
   *     S3COMMS_MAX_CREDENTIAL_SIZE as the size of `dest`; the caller must
   *     supply a buffer at least that large.
   *
   *     _HAS NO ERROR-CHECKING FACILITIES_
   *     The expression evaluates to HDsnprintf()'s return value and it is
   *     left to the programmer to check it. Assuming HDsnprintf() follows
   *     snprintf() semantics (TODO confirm), a result that is negative or
   *     not smaller than the buffer size means failure or truncation, e.g.:
   *
   *         int n = S3COMMS_FORMAT_CREDENTIAL(...);
   *         assert(n >= 0 && n < S3COMMS_MAX_CREDENTIAL_SIZE);
   *
   *     ALL inputs must be null-terminated strings.
   *
   *     `access`       should be the user's access key ID.
   *     `iso8601_date` must be of format "YYYYmmdd".
   *     `region`       should be the relevant AWS region, e.g. "us-east-1".
   *     `service`      should be "s3".
   *
   *---------------------------------------------------------------------------
   */
  #define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) \
      HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE, \
                 "%s/%s/%s/%s" "/aws4_request", \
                 (access), (iso8601_date), (region), (service))
  151. /*********************
  152. * PUBLIC STRUCTURES *
  153. *********************/
  154. /*----------------------------------------------------------------------------
  155. *
  156. * Structure: hrb_node_t
  157. *
  158. * HTTP Header Field Node
  159. *
  160. *
  161. *
  162. * Maintain an ordered (linked) list of HTTP Header fields.
  163. *
  164. * Provides efficient access and manipulation of a logical sequence of
  165. * HTTP header fields, of particular use when composing an
  166. * "S3 Canonical Request" for authentication.
  167. *
  168. * - The creation of a Canonical Request involves:
  169. * - convert field names to lower case
  170. * - sort by this lower-case name
  171. * - convert ": " name-value separator in HTTP string to ":"
  172. * - get sorted lowercase names without field or separator
  173. *
  174. * As HTTP headers allow headers in any order (excepting the case of multiple
  175. * headers with the same name), the list ordering can be optimized for Canonical
  176. * Request creation, suggesting alphabetical order. For more expedient insertion
  177. * and removal of elements in the list, linked list seems preferable to a
  178. * dynamically-expanding array. The usually-smaller number of entries (5 or
  179. * fewer) makes performance overhead of traversing the list trivial.
  180. *
  181. * The above requirements of creating a Canonical Request suggest a reasonable
  182. * trade-off of speed for space with the option to compute elements as needed
  183. * or to have the various elements prepared and stored in the structure
  184. * (e.g. name, value, lowername, concatenated name:value)
  185. * The structure currently is implemented to pre-compute.
  186. *
  187. * At all times, the "first" node of the list should be the least,
  188. * alphabetically. For all nodes, the `next` node should be either NULL or
  189. * of greater alphabetical value.
  190. *
  191. * Each node contains its own header field information, plus a pointer to the
  192. * next node.
  193. *
  194. * It is not allowed to have multiple nodes with the same _lowercase_ `name`s
  195. * in the same list
  196. * (i.e., name is case-insensitive for access and modification.)
  197. *
  198. * All data (`name`, `value`, `lowername`, and `cat`) are null-terminated
  199. * strings allocated specifically for their node.
  200. *
  201. *
  202. *
  203. * `magic` (unsigned long)
  204. *
  205. * "unique" identifier number for the structure type
  206. *
  207. * `name` (char *)
  208. *
  209. * Case-meaningful name of the HTTP field.
  210. * Given case is how it is supplied to networking code.
  211. * e.g., "Range"
  212. *
  213. * `lowername` (char *)
  214. *
  215. * Lowercase copy of name.
  216. * e.g., "range"
  217. *
  218. * `value` (char *)
  219. *
  220. * Case-meaningful value of HTTP field.
  221. * e.g., "bytes=0-9"
  222. *
  223. * `cat` (char *)
  224. *
  225. * Concatenated, null-terminated string of HTTP header line,
  226. * as the field would appear in an HTTP request.
  227. * e.g., "Range: bytes=0-9"
  228. *
  229. * `next` (hrb_node_t *)
  230. *
  231. * Pointer to the next node in the list, or NULL sentinel at the end of the list.
  232. * Next node must have a greater `lowername` as determined by strcmp().
  233. *
  234. *----------------------------------------------------------------------------
  235. */
  /* One HTTP header field, kept as a node of a singly linked list. */
  typedef struct hrb_node_t {
      unsigned long      magic;     /* structure-type identifier (see S3COMMS_HRB_NODE_MAGIC) */
      char              *name;      /* field name in the case supplied, e.g. "Range" */
      char              *value;     /* field value, e.g. "bytes=0-9" */
      char              *cat;       /* whole header line, e.g. "Range: bytes=0-9" */
      char              *lowername; /* lowercase copy of `name`, e.g. "range" */
      struct hrb_node_t *next;      /* following node in the list, or NULL at the end */
  } hrb_node_t;

  /* Identifier value associated with hrb_node_t. */
  #define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL
  245. /*----------------------------------------------------------------------------
  246. *
  247. * Structure: hrb_t
  248. *
  249. * HTTP Request Buffer structure
  250. *
  251. *
  252. *
  253. * Logically represent an HTTP request
  254. *
  255. * GET /myplace/myfile.h5 HTTP/1.1
  256. * Host: over.rainbow.oz
  257. * Date: Fri, 01 Dec 2017 12:35:04 CST
  258. *
  259. * <body>
  260. *
  261. * ...with fast, efficient access to and modification of primary and field
  262. * elements.
  263. *
  264. * Structure for building HTTP requests while hiding much of the string
  265. * processing required "under the hood."
  266. *
  267. * Information about the request target -- the first line -- and the body text,
  268. * if any, are managed directly with this structure. All header fields, e.g.,
  269. * "Host" and "Date" above, are created with a linked list of `hrb_node_t` and
  270. * included in the request by a pointer to the head of the list.
  271. *
  272. *
  273. *
  274. * `magic` (unsigned long)
  275. *
  276. * "Magic" number confirming that this is an hrb_t structure and
  277. * what operations are valid for it.
  278. *
  279. * Must be S3COMMS_HRB_MAGIC to be valid.
  280. *
  281. * `body` (char *) :
  282. *
  283. * Pointer to start of HTTP body.
  284. *
  285. * Can be NULL, in which case it is treated as the empty string, "".
  286. *
  287. * `body_len` (size_t) :
  288. *
  289. * Number of bytes (characters) in `body`. 0 if empty or NULL `body`.
  290. *
  291. * `first_header` (hrb_node_t *) :
  292. *
  293. * Pointer to first SORTED header node, if any.
  294. * It is left to the programmer to ensure that this node and associated
  295. * list is destroyed when done.
  296. *
  297. * `resource` (char *) :
  298. *
  299. * Pointer to resource URL string, e.g., "/folder/page.xhtml".
  300. *
  301. * `verb` (char *) :
  302. *
  303. * Pointer to HTTP verb string, e.g., "GET".
  304. *
  305. * `version` (char *) :
  306. *
  307. * Pointer to HTTP version string, e.g., "HTTP/1.1".
  308. *
  309. *----------------------------------------------------------------------------
  310. */
  311. typedef struct {
  312. unsigned long magic;
  313. char * body;
  314. size_t body_len;
  315. hrb_node_t * first_header;
  316. char * resource;
  317. char * verb;
  318. char * version;
  319. } hrb_t;
  320. #define S3COMMS_HRB_MAGIC 0x6DCC84UL
  321. /*----------------------------------------------------------------------------
  322. *
  323. * Structure: parsed_url_t
  324. *
  325. *
  326. * Represent a URL with easily-accessed pointers to logical elements within.
  327. * These elements (components) are stored as null-terminated strings (or just
  328. * NULLs). These components should be allocated for the structure, making the
  329. * data as safe as possible from modification. If a component is NULL, it is
  330. * either implicit in or absent from the URL.
  331. *
  332. * "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value"
  333. *  ^--^   ^-----------------------^ ^--^ ^---------^ ^-------------------^
  334. * Scheme            Host            Port  Resource        Query/-ies
  335. *
  336. *
  337. *
  338. * `magic` (unsigned long)
  339. *
  340. * Structure identification and validation identifier.
  341. * Identifies as `parsed_url_t` type.
  342. *
  343. * `scheme` (char *)
  344. *
  345. * String representing which protocol is to be expected.
  346. * _Must_ be present.
  347. * "http", "https", "ftp", e.g.
  348. *
  349. * `host` (char *)
  350. *
  351. * String of host, either domain name, IPv4, or IPv6 format.
  352. * _Must_ be present.
  353. * "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]"
  354. *
  355. * `port` (char *)
  356. *
  357. * String representation of specified port. Must resolve to a valid unsigned
  358. * integer.
  359. * "9000", "80"
  360. *
  361. * `path` (char *)
  362. *
  363. * Path to resource on host. If not specified, assumes root "/".
  364. * "lollipop_guild.wav", "characters/witches/white.dat"
  365. *
  366. * `query` (char *)
  367. *
  368. * Single string of all query parameters in url (if any).
  369. * "arg1=value1&arg2=value2"
  370. *
  371. *----------------------------------------------------------------------------
  372. */
  /* URL broken into its components; each component is a null-terminated
   * string or NULL when implicit in or absent from the URL.
   */
  typedef struct {
      unsigned long magic;  /* structure identification and validation identifier */
      char         *scheme; /* required; protocol, e.g. "http", "https" */
      char         *host;   /* required; domain name, IPv4, or IPv6 */
      char         *port;   /* port as a string, e.g. "9000", "80" */
      char         *path;   /* path to resource on host; root "/" assumed if unspecified */
      char         *query;  /* all query parameters as one string, e.g. "arg1=value1&arg2=value2" */
  } parsed_url_t;

  /* Identifier value associated with parsed_url_t. */
  #define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL
  382. /*----------------------------------------------------------------------------
  383. *
  384. * Structure: s3r_t
  385. *
  386. *
  387. *
  388. * S3 request structure "handle".
  389. *
  390. * Holds persistent information for Amazon S3 requests.
  391. *
  392. * Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self.
  393. *
  394. * Intended to be re-used for operations on a remote object.
  395. *
  396. * Cleaned up through `H5FD_s3comms_s3r_close()`.
  397. *
  398. * _DO NOT_ share handle between threads: curl easy handle `curlhandle` has
  399. * undefined behavior if called to perform in multiple threads.
  400. *
  401. *
  402. *
  403. * `magic` (unsigned long)
  404. *
  405. * "magic" number identifying this structure as unique type.
  406. * MUST equal `S3COMMS_S3R_MAGIC` to be valid.
  407. *
  408. * `curlhandle` (CURL *)
  409. *
  410. * Pointer to the curl_easy handle generated for the request.
  411. *
  412. * `httpverb` (char *)
  413. *
  414. * Pointer to NULL-terminated string. HTTP verb,
  415. * e.g. "GET", "HEAD", "PUT", etc.
  416. *
  417. * Default is NULL, resulting in a "GET" request.
  418. *
  419. * `purl` (parsed_url_t *)
  420. *
  421. * Pointer to structure holding the elements of URL for file open.
  422. *
  423. * e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2"
  424. * parsed into...
  425. * { scheme: "http"
  426. * host: "bucket.aws.com"
  427. * port: "8080"
  428. * path: "myfile.dat"
  429. * query: "q1=v1&q2=v2"
  430. * }
  431. *
  432. * Cannot be NULL.
  433. *
  434. * `region` (char *)
  435. *
  436. * Pointer to NULL-terminated string, specifying S3 "region",
  437. * e.g., "us-east-1".
  438. *
  439. * Required to authenticate.
  440. *
  441. * `secret_id` (char *)
  442. *
  443. * Pointer to NULL-terminated string for "secret" access id to S3 resource.
  444. *
  445. * Required to authenticate.
  446. *
  447. * `signing_key` (unsigned char *)
  448. *
  449. * Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing
  450. * key, generated via
  451. * `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>",
  452. * "<yyyyMMDD>"), "<aws-region>"), "<aws-service>"), "aws4_request")`
  453. * which may be re-used for several (up to seven (7)) days from creation?
  454. * Computed once upon file open.
  455. *
  456. * Required to authenticate.
  457. *
  458. *----------------------------------------------------------------------------
  459. */
  460. typedef struct {
  461. unsigned long magic;
  462. CURL * curlhandle;
  463. size_t filesize;
  464. char * httpverb;
  465. parsed_url_t * purl;
  466. char * region;
  467. char * secret_id;
  468. unsigned char *signing_key;
  469. } s3r_t;
  /* Identifier value for the `magic` field of s3r_t.
   * The UL suffix gives the constant the same type as that `unsigned long`
   * field and matches the other S3COMMS_*_MAGIC constants in this header;
   * the numeric value is unchanged.
   */
  #define S3COMMS_S3R_MAGIC 0x44D8D79UL
  471. #ifdef __cplusplus
  472. extern "C" {
  473. #endif
  474. /*******************************************
  475. * DECLARATION OF HTTP FIELD LIST ROUTINES *
  476. *******************************************/
  477. H5_DLL herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L, const char *name, const char *value);
  478. /***********************************************
  479. * DECLARATION OF HTTP REQUEST BUFFER ROUTINES *
  480. ***********************************************/
  481. H5_DLL herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf);
  482. H5_DLL hrb_t *H5FD_s3comms_hrb_init_request(const char *verb, const char *resource, const char *host);
  483. /*************************************
  484. * DECLARATION OF S3REQUEST ROUTINES *
  485. *************************************/
  486. H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle);
  487. H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle);
  488. H5_DLL s3r_t *H5FD_s3comms_s3r_open(const char url[], const char region[], const char id[],
  489. const unsigned char signing_key[]);
  490. H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t *handle, haddr_t offset, size_t len, void *dest);
  491. /*********************************
  492. * DECLARATION OF OTHER ROUTINES *
  493. *********************************/
  494. H5_DLL struct tm *gmnow(void);
  495. H5_DLL herr_t H5FD_s3comms_aws_canonical_request(char *canonical_request_dest, int cr_size,
  496. char *signed_headers_dest, int sh_size, hrb_t *http_request);
  497. H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char *dest, const unsigned char *msg, size_t msg_len,
  498. hbool_t lowercase);
  499. H5_DLL herr_t H5FD_s3comms_free_purl(parsed_url_t *purl);
  500. H5_DLL herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key, size_t key_len, const char *msg,
  501. size_t msg_len, char *dest);
  502. H5_DLL herr_t H5FD_s3comms_load_aws_profile(const char *name, char *key_id_out, char *secret_access_key_out,
  503. char *aws_region_out);
  504. H5_DLL herr_t H5FD_s3comms_nlowercase(char *dest, const char *s, size_t len);
  505. H5_DLL herr_t H5FD_s3comms_parse_url(const char *str, parsed_url_t **purl);
  506. H5_DLL herr_t H5FD_s3comms_percent_encode_char(char *repr, const unsigned char c, size_t *repr_len);
  507. H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md, const char *secret, const char *region,
  508. const char *iso8601now);
  509. H5_DLL herr_t H5FD_s3comms_tostringtosign(char *dest, const char *req_str, const char *now,
  510. const char *region);
  511. H5_DLL herr_t H5FD_s3comms_trim(char *dest, char *s, size_t s_len, size_t *n_written);
  512. H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, const char *s, size_t s_len, hbool_t encode_slash,
  513. size_t *n_written);
  514. #ifdef __cplusplus
  515. }
  516. #endif
  517. #endif /* H5_HAVE_ROS3_VFD */