Skip to content

Commit f769bd6

Browse files
committed
add: parse sas file
1 parent ceb346b commit f769bd6

File tree

1 file changed

+397
-0
lines changed

1 file changed

+397
-0
lines changed

include/xframe/xio_sas.hpp

+397
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,397 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* *
6+
* Distributed under the terms of the BSD 3-Clause License. *
7+
* *
8+
* The full license is in the file LICENSE, distributed with this software. *
9+
****************************************************************************/
10+
11+
#ifndef XFRAME_IO_SAS_HPP
12+
#define XFRAME_IO_SAS_HPP
13+
14+
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <vector>

#include <xtl/xany.hpp>

#include "xvariable.hpp"
20+
21+
namespace xf
22+
{
23+
// Selects which SAS file layout read_sas should parse.
// sas7bdata is the default argument of read_sas.
enum class sas_format
{
    sas7bdata,
    xport
};
28+
// Entry point declaration: takes an input file stream and a sas_format
// selector (defaulting to sas_format::sas7bdata) and returns an xvariable
// holding xtl::any values.
// NOTE(review): no definition is visible in this file (only a commented-out
// draft further down), and the stream is taken by const reference although
// detail::xsas7bdat_reader is constructed from a mutable std::ifstream& -
// confirm the intended signature before implementing.
inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata);
29+
30+
namespace detail
31+
{
32+
// Values of the `endian` byte of the file header; parse_head compares the
// byte against these to decide whether multi-byte fields must be swapped.
constexpr uint8_t sas_endian_big = 0x00;
constexpr uint8_t sas_endian_little = 0x01;

// Values for the header's `file_format` character.
// NOTE(review): not consulted by the parser in this file yet.
constexpr char sas_file_format_unix = '1';
constexpr char sas_file_format_windows = '2';

// Values of the header bytes `a1` / `a2`. In parse_head, a1 equal to
// sas_aligment_offset_4 adds 4 bytes of padding and a2 equal to
// sas_aligment_offset_4 switches the reader to 64-bit fields.
constexpr uint8_t sas_aligment_offset_0 = 0x22;
constexpr uint8_t sas_aligment_offset_4 = 0x33;

// Column type tags. NOTE(review): not used by the parser in this file yet.
constexpr uint8_t sas_column_type_number = 0x01;
constexpr uint8_t sas_column_type_char = 0x02;

// Subheader signatures that parse_page dispatches on.
constexpr uint32_t sas_subheader_signature_row_size = 0xF7F7F7F7;
constexpr uint32_t sas_subheader_signature_column_size = 0xF6F6F6F6;
constexpr uint32_t sas_subheader_signature_counts = 0xFFFFFC00;
constexpr uint32_t sas_subheader_signature_column_format = 0xFFFFFBFE;

constexpr uint32_t sas_subheader_signature_column_attrs = 0xFFFFFFFC;
constexpr uint32_t sas_subheader_signature_column_text = 0xFFFFFFFD;
constexpr uint32_t sas_subheader_signature_column_list = 0xFFFFFFFE;
constexpr uint32_t sas_subheader_signature_column_name = 0xFFFFFFFF;
// Referenced by parse_page but previously never defined.
// NOTE(review): value taken from ReadStat's SAS_SUBHEADER_SIGNATURE_COLUMN_MASK - confirm.
constexpr uint32_t sas_subheader_signature_column_mask = 0xFFFFFFF8;

// Page type flags (the page-type filtering in parse_page is still a TODO).
constexpr uint16_t sas_page_type_meta = 0x0000;
constexpr uint16_t sas_page_type_data = 0x0100;
constexpr uint16_t sas_page_type_mix = 0x0200;
constexpr uint16_t sas_page_type_amd = 0x0400;
constexpr uint16_t sas_page_type_mask = 0x0F00;

constexpr uint16_t sas_page_type_meta2 = 0x4000;
constexpr uint16_t sas_page_type_comp = 0x9000;

// Size in bytes of one subheader pointer entry, per file word size.
constexpr uint64_t sas_subheader_pointer_size_32bit = 12;
constexpr uint64_t sas_subheader_pointer_size_64bit = 24;

// Size in bytes of the page header, per file word size.
constexpr uint64_t sas_page_header_size_32bit = 24;
constexpr uint64_t sas_page_header_size_64bit = 40;

// Values of the compression byte returned by parse_subheader_pointer.
constexpr uint8_t sas_compression_none = 0x00;
constexpr uint8_t sas_compression_trunc = 0x01;
constexpr uint8_t sas_compression_row = 0x04;

constexpr uint64_t sas_default_file_version = 9;

// Expected first 32 bytes of a sas7bdat file; checked by parse_head.
constexpr unsigned char sas7bdat_magic_number[32] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x60,
    0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
    0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};

// Same pattern with byte 15 equal to 0x63 instead of 0x60.
// NOTE(review): presumably the sas7bcat (catalog) magic; unused in this file.
constexpr unsigned char sas7bcat_magic_number[32] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0xc2, 0xea, 0x81, 0x63,
    0xb3, 0x14, 0x11, 0xcf, 0xbd, 0x92, 0x08, 0x00,
    0x09, 0xc7, 0x31, 0x8c, 0x18, 0x1f, 0x10, 0x11
};
88+
89+
// Reader for the sas7bdat layout. It keeps a reference to the caller's
// stream (the stream must outlive the reader) and accumulates the header
// and subheader fields it has parsed so far in its members.
class xsas7bdat_reader
{
public:
    // Binds the reader to `ifs`; nothing is read until parse_meta() is called.
    inline explicit xsas7bdat_reader(std::ifstream& ifs);

    // The reader holds a stream reference, so it is neither copyable nor movable.
    xsas7bdat_reader(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader(xsas7bdat_reader&&) = delete;
    xsas7bdat_reader& operator=(const xsas7bdat_reader&) = delete;
    xsas7bdat_reader& operator=(xsas7bdat_reader&&) = delete;

    // Parses the file header and then every page of the file.
    inline std::vector<std::string> parse_meta();
    // NOTE(review): declared only; no definition is visible in this file.
    std::vector<std::string> parse_data();

private:
    // True when the host stores the least significant byte first.
    static bool little_endian() noexcept
    {
        const int value{0x01};
        return *reinterpret_cast<const unsigned char*>(&value) == 0x01;
    }

    inline void parse_head();
    inline void parse_page(std::string& page_memory);
    // NOTE(review): declared only; no definition is visible in this file.
    inline void parse_subheader(std::string::iterator& it, uint16_t subheader_pointers_count);

    // Returns (offset, length, compression, subheader type) of one pointer entry.
    inline std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> parse_subheader_pointer(std::string::iterator& it);

    inline void parse_row_size_subheader(std::string::iterator& it);
    inline void parse_column_size_subheader(std::string::iterator& it);
    inline void parse_column_text_subheader(std::string::iterator& it);
    inline void parse_column_name_subheader(std::string::iterator& it, uint64_t length);

    uint64_t m_page_count{0};
    uint64_t m_header_size{0};
    uint64_t m_page_size{0};
    uint64_t m_max_row_count{0};
    uint64_t m_row_length{0};
    uint64_t m_total_row_count{0};
    uint64_t m_col_count{0};
    bool m_u64{false};   // file uses 64-bit fields (set by parse_head)
    bool m_swap{false};  // file byte order differs from the host's
    std::ifstream& m_sas_ifs;
};
131+
132+
// Stores a reference to `ifs`; no I/O happens here. Defined `inline` because
// this header may be included from several translation units.
inline xsas7bdat_reader::xsas7bdat_reader(std::ifstream& ifs)
    : m_sas_ifs(ifs)
{
}
135+
136+
// Returns a copy of `val` with the order of its bytes reversed.
// The bytes are moved with memcpy rather than through a union, because
// reading a union member other than the one last written is undefined
// behaviour in C++.
template <typename T>
inline auto swap_endian(const T& val) -> T
{
    static_assert(std::is_trivially_copyable<T>::value,
                  "swap_endian requires a trivially copyable type");

    unsigned char raw[sizeof(T)];
    std::memcpy(raw, &val, sizeof(T));
    std::reverse(raw, raw + sizeof(T));

    T swapped;
    std::memcpy(&swapped, raw, sizeof(T));
    return swapped;
}
148+
149+
// Reads sizeof(T) raw bytes from `ifs` into a T and returns it, byte-swapped
// when `swap` is true.
// Throws std::runtime_error if the stream cannot supply sizeof(T) bytes.
template <typename T>
inline auto read_sas_data(std::ifstream& ifs, bool swap) -> T
{
    T data{};
    if (!ifs.read(reinterpret_cast<char*>(&data), sizeof(T)))
    {
        throw std::runtime_error("parse sas error: unexpected end of stream");
    }
    return swap ? swap_endian(data) : data;
}
159+
160+
// Decodes a T from the sizeof(T) bytes starting at `memory_it`, advances
// `memory_it` past them and returns the value, byte-swapped when `swap` is
// true. The caller must guarantee that sizeof(T) bytes are available; this
// function cannot check it because it only sees the iterator.
template <typename T>
inline auto read_sas_data(std::string::iterator& memory_it, bool swap) -> T
{
    T data{};
    // Copy straight into the object instead of building a temporary string.
    std::copy_n(memory_it, sizeof(T), reinterpret_cast<char*>(&data));
    memory_it += static_cast<std::string::difference_type>(sizeof(T));
    return swap ? swap_endian(data) : data;
}
171+
std::vector<std::string> xsas7bdat_reader::parse_meta()
172+
{
173+
parse_head();
174+
auto page_header_size = m_u64 ? sas_page_header_size_64bit : sas_page_header_size_32bit;
175+
auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit;
176+
auto subheader_signature_size = m_u64 ? 8 : 4;
177+
178+
for (decltype(m_page_count) idx = 0; idx < m_page_count; idx++)
179+
{
180+
auto page_offset = m_header_size + idx * m_page_size;
181+
if (!m_sas_ifs.seekg(page_offset, m_sas_ifs.beg))
182+
throw std::runtime_error("parse sas error");
183+
184+
std::string page_memory;
185+
if (!m_sas_ifs.read(&page_memory[0], m_page_size))
186+
throw std::runtime_error("parse sas error");
187+
188+
parse_page(page_memory);
189+
}
190+
191+
}
192+
// template <typename T>
193+
// inline void seekg(std::ifstream& ifs, std::ios_base::seekdir& seekdir)
194+
// {
195+
// ifs.seekg(sizeof(T), seekdir);
196+
// return;
197+
// }
198+
199+
// inline void seekg(std::ifstream& ifs, size_t pos, std::ios_base::seekdir& seekdir)
200+
// {
201+
// ifs.seekg(pos, seekdir);
202+
// return;
203+
// }
204+
205+
// template <typename T>
206+
// inline void seekg(std::string::iterator& memory_it)
207+
// {
208+
// memory_it += sizeof(T);
209+
// return;
210+
// }
211+
212+
// Advances `memory_it` by `pos` bytes. No bounds check is possible here; the
// caller must keep the iterator inside its buffer.
inline void seekg(std::string::iterator& memory_it, std::size_t pos)
{
    memory_it += static_cast<std::string::difference_type>(pos);
}
217+
218+
void xsas7bdat_reader::parse_head()
219+
{
220+
#pragma pack(push, 1)
221+
struct sas_header_begin
222+
{
223+
unsigned char magic_number[32];
224+
unsigned char a2;
225+
unsigned char mystery1[2];
226+
unsigned char a1;
227+
unsigned char mystery2[1];
228+
unsigned char endian;
229+
unsigned char mystery3[1];
230+
char file_format;
231+
unsigned char mystery4[30];
232+
unsigned char encoding;
233+
unsigned char mystery5[13];
234+
char file_type[8];
235+
char file_label[64];
236+
char file_info[8];
237+
};
238+
#pragma pack(pop)
239+
240+
auto header_begin = read_sas_data<sas_header_begin>(m_sas_ifs, false);
241+
if (std::memcmp(header_begin.magic_number, sas7bdat_magic_number, sizeof(sas7bdat_magic_number)) != 0)
242+
throw std::runtime_error("error");
243+
auto a1 = 0;
244+
if (header_begin.a1 == sas_aligment_offset_4)
245+
a1 = 4;
246+
if (header_begin.a2 == sas_aligment_offset_4)
247+
m_u64 = true;
248+
m_swap = false;
249+
if (header_begin.endian == sas_endian_big)
250+
m_swap = little_endian();
251+
else if (header_begin.endian == sas_endian_little)
252+
m_swap = !little_endian();
253+
else
254+
throw std::runtime_error("parse sas error");
255+
if (!m_sas_ifs.seekg(a1 + sizeof(double) * 2 + 16, m_sas_ifs.cur))
256+
throw std::runtime_error("parse sas error");
257+
258+
m_header_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
259+
m_page_size = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
260+
if (m_header_size < 1024 || m_page_size < 1024)
261+
throw std::runtime_error("");
262+
if (m_header_size > (1 << 20) || m_page_size > (1 << 24))
263+
throw std::runtime_error("");
264+
265+
if (m_u64)
266+
m_page_count = read_sas_data<uint64_t>(m_sas_ifs, m_swap);
267+
else
268+
m_page_count = read_sas_data<uint32_t>(m_sas_ifs, m_swap);
269+
if (m_page_count > (1 << 24))
270+
throw std::runtime_error("");
271+
}
272+
273+
void xsas7bdat_reader::parse_page(std::string& page_memory)
274+
{
275+
auto it = page_memory.begin();
276+
auto signature = read_sas_data<uint32_t>(it, m_swap);
277+
m_u64 ? seekg(it, 12) : seekg(it, 28);
278+
279+
auto page_type = read_sas_data<uint16_t>(it, m_swap);
280+
//TODO
281+
// if ((page_type & sas_page_type_mask) == sas_page_type_data)
282+
// break;
283+
// if ((page_type & sas_page_type_comp))
284+
// continue;
285+
auto data_block_count = read_sas_data<uint16_t>(it, m_swap);
286+
auto subheader_pointers_count = read_sas_data<uint16_t>(it, m_swap);
287+
seekg(it, 2);
288+
auto subheader_pointer_size = m_u64 ? sas_subheader_pointer_size_64bit : sas_subheader_pointer_size_32bit;
289+
for (auto idx = 0; idx < subheader_pointers_count; idx++)
290+
{
291+
auto it = it + subheader_pointer_size;
292+
auto ret = parse_subheader_pointer(it);
293+
auto subheader_it = page_memory.begin() + std::get<0>(ret);
294+
if (signature == sas_subheader_signature_row_size) {
295+
if (std::get<1>(ret) < (m_u64 ? 128 : 64))
296+
throw std::runtime_error("");
297+
parse_row_size_subheader(subheader_it);
298+
} else if (signature == sas_subheader_signature_column_size) {
299+
if (std::get<1>(ret) < (m_u64 ? 16 : 8))
300+
throw std::runtime_error("");
301+
parse_column_size_subheader(subheader_it);
302+
} else if (signature == sas_subheader_signature_counts) {
303+
/* void */
304+
} else if (signature == sas_subheader_signature_column_text) {
305+
parse_column_text_subheader(subheader_it);
306+
} else if (signature == sas_subheader_signature_column_name) {
307+
parse_column_name_subheader(subheader_it, std::get<1>(ret));
308+
} else if (signature == sas_subheader_signature_column_attrs) {
309+
parse_column_attributes_subheader(subheader_it);
310+
} else if (signature == sas_subheader_signature_column_format) {
311+
parse_column_format_subheader(subheader_it);
312+
} else if (signature == sas_subheader_signature_column_list) {
313+
/* void */
314+
} else if ((signature & sas_subheader_signature_column_mask) == sas_subheader_signature_column_mask) {
315+
/* void */
316+
} else {
317+
/* void */
318+
}
319+
}
320+
}
321+
322+
std::tuple<uint64_t, uint64_t, uint8_t, uint8_t> xsas7bdat_reader::parse_subheader_pointer()
323+
{
324+
auto offset_to_subhead = 0;
325+
auto length = 0;
326+
auto compression = 0;
327+
auto subheader_type = 0;
328+
if (m_u64)
329+
{
330+
offset_to_subhead = read_sas_data<uint64_t>(it, m_swap);
331+
length = read_sas_data<uint64_t>(it, m_swap);
332+
compression = read_sas_data<uint8_t>(it, m_swap);
333+
subheader_type = read_sas_data<uint8_t>(it, m_swap);
334+
seekg(it, 7);
335+
}
336+
else
337+
{
338+
offset_to_subhead = read_sas_data<uint32_t>(it, m_swap);
339+
length = read_sas_data<uint32_t>(it, m_swap);
340+
compression = read_sas_data<uint8_t>(it, m_swap);
341+
subheader_type = read_sas_data<uint8_t>(it, m_swap);
342+
seekg(it, 3);
343+
}
344+
return std::make_tuple(offset_to_subhead, length, compression, subheader_type);
345+
}
346+
347+
void xsas7bdat_reader::parse_row_size_subheader(std::string::iterator& it)
348+
{
349+
m_u64 ? seekg(it, 40) : seekg(it, 20);
350+
m_row_length = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
351+
m_total_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
352+
m_u64 ? seekg(it, 72) : seekg(it, 36);
353+
m_max_row_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
354+
}
355+
356+
void xsas7bdat_reader::parse_column_size_subheader(std::string::iterator& it)
357+
{
358+
m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
359+
}
360+
void xsas7bdat_reader::parse_column_text_subheader(std::string::iterator& it)
361+
{
362+
m_col_count = m_u64 ? read_sas_data<uint64_t>(it, m_swap) : read_sas_data<uint32_t>(it, m_swap);
363+
}
364+
365+
void xsas7bdat_reader::parse_column_name_subheader(std::string::iterator& it, uint64_t length)
366+
{
367+
int cmax = m_u64 ? (length - 28) / 8 : (length - 20) / 8;
368+
readstat_error_t retval = READSTAT_OK;
369+
}
370+
371+
// void xsas7bdat_reader::parse_column_attrs_subheader()
372+
// {
373+
// }
374+
375+
// void xsas7bdat_reader::parse_column_format_subheader()
376+
// {
377+
378+
// }
379+
380+
381+
}
382+
// inline xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>> xf::read_sas(const std::ifstream& ifs, const sas_format& format = sas_format::sas7bdata)
383+
// {
384+
// if (format == sas_format::sas7bdata)
385+
// {
386+
// detail::xsas7bdat_reader reader(ifs);
387+
// return reader.parse();
388+
// }
389+
// else
390+
// {
391+
392+
// }
393+
// return xf::xvariable<xtl::any, xf::xcoordinate<xf::fstring>();
394+
// }
395+
}
396+
397+
#endif

0 commit comments

Comments
 (0)