|
| 1 | +/*************************************************************************** |
| 2 | +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
| 3 | +* Martin Renou * |
| 4 | +* Copyright (c) QuantStack * |
| 5 | +* * |
| 6 | +* Distributed under the terms of the BSD 3-Clause License. * |
| 7 | +* * |
| 8 | +* The full license is in the file LICENSE, distributed with this software. * |
| 9 | +****************************************************************************/ |
| 10 | + |
| 11 | +#ifndef XFRAME_IO_SAS_HPP |
| 12 | +#define XFRAME_IO_SAS_HPP |
| 13 | + |
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include <xtl/xany.hpp>

#include "xvariable.hpp"
| 20 | + |
| 21 | +namespace xf |
| 22 | +{ |
| 23 | + enum class sas_format |
| 24 | + { |
| 25 | + sas7bdata, |
| 26 | + xport |
| 27 | + }; |
| 28 | + inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata); |
| 29 | + |
| 30 | + namespace detail |
| 31 | + { |
| 32 | + constexpr uint8_t sas_endian_big = 0x00; |
| 33 | + constexpr uint8_t sas_endian_little = 0x01; |
| 34 | + |
| 35 | + constexpr char sas_file_format_unix = '1'; |
| 36 | + constexpr char sas_file_format_windows = '2'; |
| 37 | + |
| 38 | + constexpr uint8_t sas_aligment_offset_0 = 0x22; |
| 39 | + constexpr uint8_t sas_aligment_offset_4 = 0x33; |
| 40 | + |
| 41 | + constexpr uint8_t sas_column_type_number = 0x01; |
| 42 | + constexpr uint8_t sas_column_type_char = 0x02; |
| 43 | + |
| 44 | + constexpr uint32_t sas_subheader_signature_row_size = 0xF7F7F7F7; |
| 45 | + constexpr uint32_t sas_subheader_signature_column_size = 0xF6F6F6F6; |
| 46 | + constexpr uint32_t sas_subheader_signature_counts = 0xFFFFFC00; |
| 47 | + constexpr uint32_t sas_subheader_signature_column_format = 0xFFFFFBFE; |
| 48 | + |
| 49 | + constexpr uint32_t sas_subheader_signature_column_attrs = 0xFFFFFFFC; |
| 50 | + constexpr uint32_t sas_subheader_signature_column_text = 0xFFFFFFFD; |
| 51 | + constexpr uint32_t sas_subheader_signature_column_list = 0xFFFFFFFE; |
| 52 | + constexpr uint32_t sas_subheader_signature_column_name = 0xFFFFFFFF; |
| 53 | + |
| 54 | + constexpr uint16_t sas_page_type_meta = 0x0000; |
| 55 | + constexpr uint16_t sas_page_type_data = 0x0100; |
| 56 | + constexpr uint16_t sas_page_type_mix = 0x0200; |
| 57 | + constexpr uint16_t sas_page_type_amd = 0x0400; |
| 58 | + constexpr uint16_t sas_page_type_mask = 0x0F00; |
| 59 | + |
| 60 | + constexpr uint16_t sas_page_type_meta2 = 0x4000; |
| 61 | + constexpr uint16_t sas_page_type_comp = 0x9000; |
| 62 | + |
| 63 | + constexpr uint64_t sas_subheader_pointer_size_32bit = 12; |
| 64 | + constexpr uint64_t sas_subheader_pointer_size_64bit = 24; |
| 65 | + |
| 66 | + constexpr uint64_t sas_page_header_size_32bit = 24; |
| 67 | + constexpr uint64_t sas_page_header_size_64bit = 40; |
| 68 | + |
| 69 | + constexpr uint8_t sas_compression_none = 0x00; |
| 70 | + constexpr uint8_t sas_compression_trunc = 0x01; |
| 71 | + constexpr uint8_t sas_compression_row = 0x04; |
| 72 | + |
| 73 | + constexpr uint64_t sas_default_file_version = 9; |
| 74 | + |
| 75 | + constexpr unsigned char sas7bdat_magic_number[32] = { |
| 76 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 77 | + 0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x60, |
| 78 | + 0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00, |
| 79 | + 0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11 |
| 80 | + }; |
| 81 | + |
| 82 | + constexpr unsigned char sas7bcat_magic_number[32] = { |
| 83 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 84 | + 0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x63, |
| 85 | + 0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00, |
| 86 | + 0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11 |
| 87 | + }; |
| 88 | + |
| 89 | + class xsas7bdat_reader |
| 90 | + { |
| 91 | + public: |
| 92 | + explicit xsas7bdat_reader(std::ifstream& ifs); |
| 93 | + |
| 94 | + xsas7bdat_reader(xsas7bdat_reader&) = delete; |
| 95 | + xsas7bdat_reader(xsas7bdat_reader&&) = delete; |
| 96 | + xsas7bdat_reader& operator=(const xsas7bdat_reader&) = delete; |
| 97 | + xsas7bdat_reader& operator=(xsas7bdat_reader&&) = delete; |
| 98 | + std::vector<std::string> parse_meta(); |
| 99 | + std::vector<std::string> parse_data(); |
| 100 | + |
| 101 | + private: |
| 102 | + inline bool little_endian() |
| 103 | + { |
| 104 | + const int value { 0x01 }; |
| 105 | + const void * address = static_cast<const void *>(&value); |
| 106 | + const unsigned char * least_significant_address = static_cast<const unsigned char *>(address); |
| 107 | + return (*least_significant_address == 0x01); |
| 108 | + } |
| 109 | + template <typename T> |
| 110 | + inline auto swap_endian(const T&) -> T; |
| 111 | + template <typename T> |
| 112 | + inline auto read_data(const std::ifstream& ,bool) -> T; |
| 113 | + |
| 114 | + inline void parse_head(); |
| 115 | + inline void parse_page(std::string::iterator& it); |
| 116 | + inline void parse_subheader(std::string::iterator& it, uint16_t subheader_pointers_count); |
| 117 | + |
| 118 | + std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> parse_subheader_pointer(std::string::iterator& it); |
| 119 | + |
| 120 | + uint64_t m_page_count {0}; |
| 121 | + uint64_t m_header_size {0}; |
| 122 | + uint64_t m_page_size {0}; |
| 123 | + uint64_t m_max_row_count{0}; |
| 124 | + uint64_t m_row_length{0}; |
| 125 | + uint64_t m_total_row_count; |
| 126 | + uint64_t m_col_count; |
| 127 | + bool m_u64 {false}; |
| 128 | + bool m_swap {false}; |
| 129 | + std::ifstream& m_sas_ifs; |
| 130 | + }; |
| 131 | + |
| 132 | + xsas7bdat_reader::xsas7bdat_reader(std::ifstream& ifs) : m_sas_ifs(ifs) |
| 133 | + { |
| 134 | + } |
| 135 | + |
| 136 | + template <typename T> |
| 137 | + inline auto swap_endian(const T &val) -> T |
| 138 | + { |
| 139 | + union |
| 140 | + { |
| 141 | + T val; |
| 142 | + std::array<std::uint8_t, sizeof(T)> raw; |
| 143 | + } src, dst; |
| 144 | + src.val = val; |
| 145 | + std::reverse_copy(src.raw.begin(), src.raw.end(), dst.raw.begin()); |
| 146 | + return dst.val; |
| 147 | + } |
| 148 | + |
| 149 | + template <typename T> |
| 150 | + inline auto read_sas_data(std::ifstream& ifs ,bool swap) -> T |
| 151 | + { |
| 152 | + T data; |
| 153 | + if (!ifs.read((char*)(&data), sizeof(data))) |
| 154 | + throw std::runtime_error(""); |
| 155 | + if (swap) |
| 156 | + data = swap_endian(data); |
| 157 | + return data; |
| 158 | + } |
| 159 | + |
| 160 | + template <typename T> |
| 161 | + inline auto read_sas_data(std::string::iterator& memory_it, bool swap) -> T |
| 162 | + { |
| 163 | + T data; |
| 164 | + std::string str(memory_it, memory_it + sizeof(T)); |
| 165 | + str.copy((char*)&data, sizeof(T), 0); |
| 166 | + if (swap) |
| 167 | + data = swap_endian(data); |
| 168 | + memory_it += sizeof(T); |
| 169 | + return data; |
| 170 | + } |
| 171 | + std::vector<std::string> xsas7bdat_reader::parse_meta() |
| 172 | + { |
| 173 | + parse_head(); |
| 174 | + auto page_header_size = m_u64 ? sas_page_header_size_64bit : sas_page_header_size_32bit; |
| 175 | + auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit; |
| 176 | + auto subheader_signature_size = m_u64 ? 8 : 4; |
| 177 | + |
| 178 | + for (decltype(m_page_count) idx = 0; idx < m_page_count; idx++) |
| 179 | + { |
| 180 | + auto page_offset = m_header_size + idx * m_page_size; |
| 181 | + if (!m_sas_ifs.seekg(page_offset, m_sas_ifs.beg)) |
| 182 | + throw std::runtime_error("parse sas error"); |
| 183 | + |
| 184 | + std::string page_memory; |
| 185 | + if (!m_sas_ifs.read(&page_memory[0], m_page_size)) |
| 186 | + throw std::runtime_error("parse sas error"); |
| 187 | + |
| 188 | + parse_page(page_memory); |
| 189 | + } |
| 190 | + |
| 191 | + } |
| 192 | +// template <typename T> |
| 193 | +// inline void seekg(std::ifstream& ifs, std::ios_base::seekdir& seekdir) |
| 194 | +// { |
| 195 | +// ifs.seekg(sizeof(T), seekdir); |
| 196 | +// return; |
| 197 | +// } |
| 198 | + |
| 199 | +// inline void seekg(std::ifstream& ifs, size_t pos, std::ios_base::seekdir& seekdir) |
| 200 | +// { |
| 201 | +// ifs.seekg(pos, seekdir); |
| 202 | +// return; |
| 203 | +// } |
| 204 | + |
| 205 | +// template <typename T> |
| 206 | +// inline void seekg(std::string::iterator& memory_it) |
| 207 | +// { |
| 208 | +// memory_it += sizeof(T); |
| 209 | +// return; |
| 210 | +// } |
| 211 | + |
| 212 | + inline void seekg(std::string::iterator& memory_it, size_t pos) |
| 213 | + { |
| 214 | + memory_it += pos; |
| 215 | + return; |
| 216 | + } |
| 217 | + |
| 218 | + void xsas7bdat_reader::parse_head() |
| 219 | + { |
| 220 | + #pragma pack(push, 1) |
| 221 | + struct sas_header_begin |
| 222 | + { |
| 223 | + unsigned char magic_number[32]; |
| 224 | + unsigned char a2; |
| 225 | + unsigned char mystery1[2]; |
| 226 | + unsigned char a1; |
| 227 | + unsigned char mystery2[1]; |
| 228 | + unsigned char endian; |
| 229 | + unsigned char mystery3[1]; |
| 230 | + char file_format; |
| 231 | + unsigned char mystery4[30]; |
| 232 | + unsigned char encoding; |
| 233 | + unsigned char mystery5[13]; |
| 234 | + char file_type[8]; |
| 235 | + char file_label[64]; |
| 236 | + char file_info[8]; |
| 237 | + }; |
| 238 | + #pragma pack(pop) |
| 239 | + |
| 240 | + auto header_begin = read_sas_data<sas_header_begin>(m_sas_ifs, false); |
| 241 | + if (std::memcmp(header_begin.magic_number, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0) |
| 242 | + throw std::runtime_error("error"); |
| 243 | + auto a1 = 0; |
| 244 | + if (header_begin.a1 == sas_aligment_offset_4) |
| 245 | + a1 = 4; |
| 246 | + if (header_begin.a2 == sas_aligment_offset_4) |
| 247 | + m_u64 = true; |
| 248 | + m_swap = false; |
| 249 | + if (header_begin.endian == sas_endian_big) |
| 250 | + m_swap = little_endian(); |
| 251 | + else if (header_begin.endian == sas_endian_little) |
| 252 | + m_swap = !little_endian(); |
| 253 | + else |
| 254 | + throw std::runtime_error("parse sas error"); |
| 255 | + if (!m_sas_ifs.seekg(a1 + sizeof(double) * 2 + 16, m_sas_ifs.cur)) |
| 256 | + throw std::runtime_error("parse sas error"); |
| 257 | + |
| 258 | + m_header_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap); |
| 259 | + m_page_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap); |
| 260 | + if (m_header_size < 1024 || m_page_size < 1024) |
| 261 | + throw std::runtime_error(""); |
| 262 | + if (m_header_size > (1 << 20) || m_page_size > (1 << 24)) |
| 263 | + throw std::runtime_error(""); |
| 264 | + |
| 265 | + if (m_u64) |
| 266 | + m_page_count = read_sas_data<uint64_t>(m_sas_ifs, m_swap); |
| 267 | + else |
| 268 | + m_page_count = read_sas_data<uint32_t>(m_sas_ifs, m_swap); |
| 269 | + if (m_page_count > (1 << 24)) |
| 270 | + throw std::runtime_error(""); |
| 271 | + } |
| 272 | + |
| 273 | + void xsas7bdat_reader::parse_page(std::string& page_memory) |
| 274 | + { |
| 275 | + auto it = page_memory.begin(); |
| 276 | + auto signature = read_sas_data<uint32_t>(it, m_swap); |
| 277 | + m_u64 ? seekg(it, 12) : seekg(it, 28); |
| 278 | + |
| 279 | + auto page_type = read_sas_data<uint16_t>(it, m_swap); |
| 280 | + //TODO |
| 281 | +// if ((page_type & sas_page_type_mask) == sas_page_type_data) |
| 282 | +// break; |
| 283 | +// if ((page_type & sas_page_type_comp)) |
| 284 | +// continue; |
| 285 | + auto data_block_count = read_sas_data<uint16_t>(it, m_swap); |
| 286 | + auto subheader_pointers_count = read_sas_data<uint16_t>(it, m_swap); |
| 287 | + seekg(it, 2); |
| 288 | + auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit; |
| 289 | + for (auto idx = 0; idx < subheader_pointers_count; idx++) |
| 290 | + { |
| 291 | + auto it = it + subheader_pointer_size; |
| 292 | + auto ret = parse_subheader_pointer(it); |
| 293 | + auto subheader_it = page_memory.begin() + std::get<0>(ret); |
| 294 | + if (signature == sas_subheader_signature_row_size) { |
| 295 | + if (std::get<1>(ret) < (m_u64 ? 128 : 64)) |
| 296 | + throw std::runtime_error(""); |
| 297 | + parse_row_size_subheader(subheader_it); |
| 298 | + } else if (signature == sas_subheader_signature_column_size) { |
| 299 | + if (std::get<1>(ret) < (m_u64 ? 16 : 8)) |
| 300 | + throw std::runtime_error(""); |
| 301 | + parse_column_size_subheader(subheader_it); |
| 302 | + } else if (signature == sas_subheader_signature_counts) { |
| 303 | + /* void */ |
| 304 | + } else if (signature == sas_subheader_signature_column_text) { |
| 305 | + parse_column_text_subheader(subheader_it); |
| 306 | + } else if (signature == sas_subheader_signature_column_name) { |
| 307 | + parse_column_name_subheader(subheader_it, std::get<1>(ret)); |
| 308 | + } else if (signature == sas_subheader_signature_column_attrs) { |
| 309 | + parse_column_attributes_subheader(subheader_it); |
| 310 | + } else if (signature == sas_subheader_signature_column_format) { |
| 311 | + parse_column_format_subheader(subheader_it); |
| 312 | + } else if (signature == sas_subheader_signature_column_list) { |
| 313 | + /* void */ |
| 314 | + } else if ((signature & sas_subheader_signature_column_mask) == sas_subheader_signature_column_mask) { |
| 315 | + /* void */ |
| 316 | + } else { |
| 317 | + /* void */ |
| 318 | + } |
| 319 | + } |
| 320 | + } |
| 321 | + |
| 322 | + std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> xsas7bdat_reader::parse_subheader_pointer() |
| 323 | + { |
| 324 | + auto offset_to_subhead = 0; |
| 325 | + auto length = 0; |
| 326 | + auto compression = 0; |
| 327 | + auto subheader_type = 0; |
| 328 | + if (m_u64) |
| 329 | + { |
| 330 | + offset_to_subhead = read_sas_data<uint64_t>(it, m_swap); |
| 331 | + length = read_sas_data<uint64_t>(it, m_swap); |
| 332 | + compression = read_sas_data<uint8_t>(it, m_swap); |
| 333 | + subheader_type = read_sas_data<uint8_t>(it, m_swap); |
| 334 | + seekg(it, 7); |
| 335 | + } |
| 336 | + else |
| 337 | + { |
| 338 | + offset_to_subhead = read_sas_data<uint32_t>(it, m_swap); |
| 339 | + length = read_sas_data<uint32_t>(it, m_swap); |
| 340 | + compression = read_sas_data<uint8_t>(it, m_swap); |
| 341 | + subheader_type = read_sas_data<uint8_t>(it, m_swap); |
| 342 | + seekg(it, 3); |
| 343 | + } |
| 344 | + return std::make_tuple(offset_to_subhead, length, compression, subheader_type); |
| 345 | + } |
| 346 | + |
| 347 | + void xsas7bdat_reader::parse_row_size_subheader(std::string::iterator& it) |
| 348 | + { |
| 349 | + m_u64 ? seekg(it, 40) : seekg(it, 20); |
| 350 | + m_row_length = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); |
| 351 | + m_total_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); |
| 352 | + m_u64 ? seekg(it, 72) : seekg(it, 36); |
| 353 | + m_max_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); |
| 354 | + } |
| 355 | + |
| 356 | + void xsas7bdat_reader::parse_column_size_subheader(std::string::iterator& it) |
| 357 | + { |
| 358 | + m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); |
| 359 | + } |
| 360 | + void xsas7bdat_reader::parse_column_text_subheader(std::string::iterator& it) |
| 361 | + { |
| 362 | + m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap); |
| 363 | + } |
| 364 | + |
| 365 | + void xsas7bdat_reader::parse_column_name_subheader(std::string::iterator& it, uint64_t length) |
| 366 | + { |
| 367 | + int cmax = m_u64 ? (length - 28) / 8 : (length - 20) / 8; |
| 368 | + readstat_error_t retval = READSTAT_OK; |
| 369 | + } |
| 370 | + |
| 371 | +// void xsas7bdat_reader::parse_column_attrs_subheader() |
| 372 | +// { |
| 373 | +// } |
| 374 | + |
| 375 | +// void xsas7bdat_reader::parse_column_format_subheader() |
| 376 | +// { |
| 377 | + |
| 378 | +// } |
| 379 | + |
| 380 | + |
| 381 | + } |
| 382 | +// inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> xf::read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata) |
| 383 | +// { |
| 384 | +// if (format == sas_format::sas7bdata) |
| 385 | +// { |
| 386 | +// detail::xsas7bdat_reader reader(ifs); |
| 387 | +// return reader.parse(); |
| 388 | +// } |
| 389 | +// else |
| 390 | +// { |
| 391 | + |
| 392 | +// } |
| 393 | +// return xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>(); |
| 394 | +// } |
| 395 | +} |
| 396 | + |
| 397 | +#endif |
0 commit comments