blob: 9b6f7fb7baa2865ce3ad02e86a48496687f2d275 [file] [log] [blame]
Anthony Barbiera3adb3a2017-09-13 16:03:39 +01001/*
2 Copyright 2017 Leon Merten Lohse
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 SOFTWARE.
21*/
22
23#include <complex>
24#include <fstream>
25#include <string>
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010026#include <iostream>
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010027#include <sstream>
28#include <cstdint>
29#include <vector>
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010030#include <stdexcept>
31#include <algorithm>
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010032#include <regex>
33
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010034
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010035namespace npy {
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010036
37/* Compile-time test for byte order.
38 If your compiler does not define these per default, you may want to define
39 one of these constants manually.
40 Defaults to little endian order. */
41#if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \
42 defined(__BIG_ENDIAN__) || \
43 defined(__ARMEB__) || \
44 defined(__THUMBEB__) || \
45 defined(__AARCH64EB__) || \
46 defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__)
47const bool big_endian = true;
48#else
49const bool big_endian = false;
50#endif
51
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010052
53const char magic_string[] = "\x93NUMPY";
54const size_t magic_string_length = 6;
55
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010056const char little_endian_char = '<';
57const char big_endian_char = '>';
58const char no_endian_char = '|';
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010059
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010060constexpr char host_endian_char = ( big_endian ?
61 big_endian_char :
62 little_endian_char );
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010063
64inline void write_magic(std::ostream& ostream, unsigned char v_major=1, unsigned char v_minor=0) {
65 ostream.write(magic_string, magic_string_length);
66 ostream.put(v_major);
67 ostream.put(v_minor);
68}
69
70inline void read_magic(std::istream& istream, unsigned char *v_major, unsigned char *v_minor) {
71 char *buf = new char[magic_string_length+2];
72 istream.read(buf, magic_string_length+2);
73
74 if(!istream) {
75 throw std::runtime_error("io error: failed reading file");
76 }
77
78 for (size_t i=0; i < magic_string_length; i++) {
79 if(buf[i] != magic_string[i]) {
80 throw std::runtime_error("this file do not have a valid npy format.");
81 }
82 }
83
84 *v_major = buf[magic_string_length];
85 *v_minor = buf[magic_string_length+1];
86 delete[] buf;
87}
88
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010089// typestring magic
90struct Typestring {
91 private:
92 char c_endian;
93 char c_type;
94 int len;
Anthony Barbiera3adb3a2017-09-13 16:03:39 +010095
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +010096 public:
97 inline std::string str() {
98 const size_t max_buflen = 16;
99 char buf[max_buflen];
100 std::sprintf(buf, "%c%c%u", c_endian, c_type, len);
101 return std::string(buf);
102 }
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100103
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100104 Typestring(std::vector<float>& v) :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(float)} {}
105 Typestring(std::vector<double>& v) :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(double)} {}
106 Typestring(std::vector<long double>& v) :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(long double)} {}
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100107
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100108 Typestring(std::vector<char>& v) :c_endian {no_endian_char}, c_type {'i'}, len {sizeof(char)} {}
109 Typestring(std::vector<short>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(short)} {}
110 Typestring(std::vector<int>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(int)} {}
111 Typestring(std::vector<long>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long)} {}
112 Typestring(std::vector<long long>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long long)} {}
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100113
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100114 Typestring(std::vector<unsigned char>& v) :c_endian {no_endian_char}, c_type {'u'}, len {sizeof(unsigned char)} {}
115 Typestring(std::vector<unsigned short>& v) :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned short)} {}
116 Typestring(std::vector<unsigned int>& v) :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned int)} {}
117 Typestring(std::vector<unsigned long>& v) :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long)} {}
118 Typestring(std::vector<unsigned long long>& v) :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long long)} {}
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100119
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100120 Typestring(std::vector<std::complex<float>>& v) :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<float>)} {}
121 Typestring(std::vector<std::complex<double>>& v) :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<double>)} {}
122 Typestring(std::vector<std::complex<long double>>& v) :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<long double>)} {}
123};
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100124
125inline void parse_typestring( std::string typestring){
126 std::regex re ("'([<>|])([ifuc])(\\d+)'");
127 std::smatch sm;
128
129 std::regex_match(typestring, sm, re );
130
131 if ( sm.size() != 4 ) {
132 throw std::runtime_error("invalid typestring");
133 }
134}
135
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100136/* Helpers for the improvised parser */
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100137inline std::string unwrap_s(std::string s, char delim_front, char delim_back) {
138 if ((s.back() == delim_back) && (s.front() == delim_front))
139 return s.substr(1, s.length()-2);
140 else
141 throw std::runtime_error("unable to unwrap");
142}
143
144inline std::string get_value_from_map(std::string mapstr) {
145 size_t sep_pos = mapstr.find_first_of(":");
146 if (sep_pos == std::string::npos)
147 return "";
148
149 return mapstr.substr(sep_pos+1);
150}
151
152inline void pop_char(std::string& s, char c) {
153 if (s.back() == c)
154 s.pop_back();
155}
156
157inline void ParseHeader(std::string header, std::string& descr, bool *fortran_order, std::vector<unsigned long>& shape) {
158 /*
159 The first 6 bytes are a magic string: exactly "x93NUMPY".
160
161 The next 1 byte is an unsigned byte: the major version number of the file format, e.g. x01.
162
163 The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. x00. Note: the version of the file format is not tied to the version of the numpy package.
164
165 The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN.
166
167 The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline ('n') and padded with spaces ('x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes.
168
169 The dictionary contains three keys:
170
171 "descr" : dtype.descr
172 An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype.
173 "fortran_order" : bool
174 Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency.
175 "shape" : tuple of int
176 The shape of the array.
177 For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order.
178 */
179
180 // remove trailing newline
181 if (header.back() != '\n')
182 throw std::runtime_error("invalid header");
183 header.pop_back();
184
185 // remove all whitespaces
186 header.erase(std::remove(header.begin(), header.end(), ' '), header.end());
187
188 // unwrap dictionary
189 header = unwrap_s(header, '{', '}');
190
191 // find the positions of the 3 dictionary keys
192 size_t keypos_descr = header.find("'descr'");
193 size_t keypos_fortran = header.find("'fortran_order'");
194 size_t keypos_shape = header.find("'shape'");
195
196 // make sure all the keys are present
197 if (keypos_descr == std::string::npos)
198 throw std::runtime_error("missing 'descr' key");
199 if (keypos_fortran == std::string::npos)
200 throw std::runtime_error("missing 'fortran_order' key");
201 if (keypos_shape == std::string::npos)
202 throw std::runtime_error("missing 'shape' key");
203
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100204 // Make sure the keys are in order.
205 // Note that this violates the standard, which states that readers *must* not
206 // depend on the correct order here.
207 // TODO: fix
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100208 if (keypos_descr >= keypos_fortran || keypos_fortran >= keypos_shape)
209 throw std::runtime_error("header keys in wrong order");
210
211 // get the 3 key-value pairs
212 std::string keyvalue_descr;
213 keyvalue_descr = header.substr(keypos_descr, keypos_fortran - keypos_descr);
214 pop_char(keyvalue_descr, ',');
215
216 std::string keyvalue_fortran;
217 keyvalue_fortran = header.substr(keypos_fortran, keypos_shape - keypos_fortran);
218 pop_char(keyvalue_fortran, ',');
219
220 std::string keyvalue_shape;
221 keyvalue_shape = header.substr(keypos_shape, std::string::npos);
222 pop_char(keyvalue_shape, ',');
223
224 // get the values (right side of `:')
225 std::string descr_s = get_value_from_map(keyvalue_descr);
226 std::string fortran_s = get_value_from_map(keyvalue_fortran);
227 std::string shape_s = get_value_from_map(keyvalue_shape);
228
229 parse_typestring(descr_s);
230 descr = unwrap_s(descr_s, '\'', '\'');
231
232 // convert literal Python bool to C++ bool
233 if (fortran_s == "True")
234 *fortran_order = true;
235 else if (fortran_s == "False")
236 *fortran_order = false;
237 else
238 throw std::runtime_error("invalid fortran_order value");
239
240 // parse the shape Python tuple ( x, y, z,)
241
242 // first clear the vector
243 shape.clear();
244 shape_s = unwrap_s(shape_s, '(', ')');
245
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100246 // a tokenizer would be nice...
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100247 size_t pos = 0;
248 size_t pos_next;
249 for(;;) {
250 pos_next = shape_s.find_first_of(',', pos);
251 std::string dim_s;
252 if (pos_next != std::string::npos)
253 dim_s = shape_s.substr(pos, pos_next - pos);
254 else
255 dim_s = shape_s.substr(pos);
256 pop_char(dim_s, ',');
257 if (dim_s.length() == 0) {
258 if (pos_next != std::string::npos)
259 throw std::runtime_error("invalid shape");
260 }else{
261 std::stringstream ss;
262 ss << dim_s;
263 unsigned long tmp;
264 ss >> tmp;
265 shape.push_back(tmp);
266 }
267 if (pos_next != std::string::npos)
268 pos = ++pos_next;
269 else
270 break;
271 }
272}
273
274inline void WriteHeader(std::ostream& out, const std::string& descr, bool fortran_order, unsigned int n_dims, const unsigned long shape[])
275{
276 std::ostringstream ss_header;
277 std::string s_fortran_order;
278 if (fortran_order)
279 s_fortran_order = "True";
280 else
281 s_fortran_order = "False";
282
283 std::ostringstream ss_shape;
284 ss_shape << "(";
285 for (unsigned int n=0; n < n_dims; n++){
286 ss_shape << shape[n] << ", ";
287 }
288 ss_shape << ")";
289
290 ss_header << "{'descr': '" << descr << "', 'fortran_order': " << s_fortran_order << ", 'shape': " << ss_shape.str() << " }";
291
292 size_t header_len_pre = ss_header.str().length() + 1;
293 size_t metadata_len = magic_string_length + 2 + 2 + header_len_pre;
294
295 unsigned char version[2] = {1, 0};
296 if (metadata_len >= 255*255) {
297 metadata_len = magic_string_length + 2 + 4 + header_len_pre;
298 version[0] = 2;
299 version[1] = 0;
300 }
301 size_t padding_len = 16 - metadata_len % 16;
302 std::string padding (padding_len, ' ');
303 ss_header << padding;
304 ss_header << '\n';
305
306 std::string header = ss_header.str();
307
308 // write magic
309 write_magic(out, version[0], version[1]);
310
311 // write header length
312 if (version[0] == 1 && version[1] == 0) {
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100313 char header_len_le16[2];
314 uint16_t header_len = header.length();
315
316 header_len_le16[0] = (header_len >> 0) & 0xff;
317 header_len_le16[1] = (header_len >> 8) & 0xff;
318 out.write(reinterpret_cast<char *>(header_len_le16), 2);
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100319 }else{
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100320 char header_len_le32[4];
321 uint32_t header_len = header.length();
322
323 header_len_le32[0] = (header_len >> 0) & 0xff;
324 header_len_le32[1] = (header_len >> 8) & 0xff;
325 header_len_le32[2] = (header_len >> 16) & 0xff;
326 header_len_le32[3] = (header_len >> 24) & 0xff;
327 out.write(reinterpret_cast<char *>(header_len_le32), 4);
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100328 }
329
330 out << header;
331}
332
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100333inline std::string read_header_1_0(std::istream& istream) {
334 // read header length and convert from little endian
335 char header_len_le16[2];
336 istream.read(header_len_le16, 2);
337
338 uint16_t header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);
339
340 if((magic_string_length + 2 + 2 + header_length) % 16 != 0) {
341 // TODO: display warning
342 }
343
344 char *buf = new char[header_length];
345 istream.read(buf, header_length);
346 std::string header (buf, header_length);
347 delete[] buf;
348
349 return header;
350}
351
352inline std::string read_header_2_0(std::istream& istream) {
353 // read header length and convert from little endian
354 char header_len_le32[4];
355 istream.read(header_len_le32, 4);
356
357 uint32_t header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8)
358 | (header_len_le32[2] << 16) | (header_len_le32[3] << 24);
359
360 if((magic_string_length + 2 + 4 + header_length) % 16 != 0) {
361 // TODO: display warning
362 }
363
364 char *buf = new char[header_length];
365 istream.read(buf, header_length);
366 std::string header (buf, header_length);
367 delete[] buf;
368
369 return header;
370}
371
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100372template<typename Scalar>
373void SaveArrayAsNumpy( const std::string& filename, bool fortran_order, unsigned int n_dims, const unsigned long shape[], const std::vector<Scalar>& data)
374{
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100375 Typestring typestring_o {data};
376 std::string typestring = typestring_o.str();
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100377
378 std::ofstream stream( filename, std::ofstream::binary);
379 if(!stream) {
380 throw std::runtime_error("io error: failed to open a file.");
381 }
382 WriteHeader(stream, typestring, fortran_order, n_dims, shape);
383
384 size_t size = 1;
385 for (unsigned int i=0; i<n_dims; ++i)
386 size *= shape[i];
387 stream.write(reinterpret_cast<const char*>(&data[0]), sizeof(Scalar) * size);
388}
389
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100390
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100391/**
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100392
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100393 */
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100394template<typename Scalar>
395void LoadArrayFromNumpy(const std::string& filename, std::vector<unsigned long>& shape, std::vector<Scalar>& data)
396{
397 std::ifstream stream(filename, std::ifstream::binary);
398 if(!stream) {
399 throw std::runtime_error("io error: failed to open a file.");
400 }
401 // check magic bytes an version number
402 unsigned char v_major, v_minor;
403 read_magic(stream, &v_major, &v_minor);
404
405 std::string header;
406
407 if(v_major == 1 && v_minor == 0){
408 header = read_header_1_0(stream);
409 }else if(v_major == 2 && v_minor == 0) {
410 header = read_header_2_0(stream);
411 }else{
412 throw std::runtime_error("unsupported file format version");
413 }
414
415 // parse header
416 bool fortran_order;
417 std::string typestr;
418
419 ParseHeader(header, typestr, &fortran_order, shape);
420
421 // check if the typestring matches the given one
Anthony Barbier3c5b4ff2017-10-12 13:20:52 +0100422 Typestring typestring_o {data};
423 std::string expect_typestr = typestring_o.str();
Anthony Barbiera3adb3a2017-09-13 16:03:39 +0100424 if (typestr != expect_typestr) {
425 throw std::runtime_error("formatting error: typestrings not matching");
426 }
427
428 // compute the data size based on the shape
429 size_t total_size = 1;
430 for(size_t i=0; i<shape.size(); ++i) {
431 total_size *= shape[i];
432 }
433 data.resize(total_size);
434
435 // read the data
436 stream.read(reinterpret_cast<char*>(&data[0]), sizeof(Scalar)*total_size);
437}
438
439} // namespace npy