Skip to content

Commit 3cfc6ee

Browse files
authored
Retain up to microsecond precision when importing sas7bdat files (#330)
* Read fractions of seconds from SAS datasets
* Add a test for a SAS file with fractional seconds time
* Prevent rounding errors when processing datetime to polars

Datetime values in SAS, SPSS and STATA are stored as floating point numbers. Any operation risks introducing a rounding error. Use integer math in order to preserve the original interpretation.
1 parent 7b90176 commit 3cfc6ee

4 files changed

Lines changed: 139 additions & 7 deletions

File tree

pyreadstat/_readstat_parser.pyx

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -233,13 +233,13 @@ cdef object transform_datetime(py_datetime_format var_format, double tstamp, py_
233233
return mydat.date()
234234
elif var_format == DATE_FORMAT_DATETIME:
235235
if output_format == "polars":
236-
# we want to return seconds from unix
236+
# we want to return timestamp in seconds
237237
if file_format == FILE_FORMAT_STATA:
238238
# tstamp is in milliseconds
239-
return (tstamp/1000) - unix_to_origin_secs
239+
return (tstamp/1000)
240240
else:
241241
# tstamp in seconds
242-
return tstamp - unix_to_origin_secs
242+
return tstamp
243243

244244
if file_format == FILE_FORMAT_STATA:
245245
# tstamp is in milliseconds
@@ -253,7 +253,8 @@ cdef object transform_datetime(py_datetime_format var_format, double tstamp, py_
253253
# tstamp in seconds
254254
days = <int> (floor(tstamp / 86400))
255255
secs = <int> (tstamp % 86400)
256-
tdelta = timedelta_new(days, secs, 0)
256+
usecs = <int> (round(tstamp % 1 * 1e6))
257+
tdelta = timedelta_new(days, secs, usecs)
257258
#tdelta = timedelta(seconds=tstamp)
258259
mydat = origin + tdelta
259260
return mydat
@@ -270,7 +271,8 @@ cdef object transform_datetime(py_datetime_format var_format, double tstamp, py_
270271
# tstamp in seconds
271272
days = <int> (floor(tstamp / 86400))
272273
secs = <int> (tstamp % 86400)
273-
tdelta = timedelta_new(days, secs, 0)
274+
usecs = <int> (round(tstamp % 1 * 1e6))
275+
tdelta = timedelta_new(days, secs, usecs)
274276
#tdelta = timedelta(seconds=tstamp)
275277
mydat = origin + tdelta
276278
return mydat.time()
@@ -1105,7 +1107,16 @@ cdef object dict_to_dataframe(object dict_data, data_container dc):
11051107
if var_format == DATE_FORMAT_DATE:
11061108
date_cols.append(column)
11071109
if datetime_cols:
1108-
data_frame = data_frame.with_columns(pl.from_epoch(pl.col(*datetime_cols), time_unit='s'))
1110+
data_frame = data_frame.with_columns(
1111+
[
1112+
pl.from_epoch(
1113+
(pl.col(c) % 1 * 1e6).round().cast(pl.Int64) + (
1114+
pl.col(c).floor() * 1e6).cast(pl.Int64) - (
1115+
pl.lit(dc.unix_to_origin_secs) * 1e6).cast(pl.Int64),
1116+
time_unit='us')
1117+
for c in datetime_cols if data_frame[c].len() > 0
1118+
]
1119+
)
11091120
if date_cols:
11101121
data_frame = data_frame.with_columns(pl.from_epoch(pl.col(*date_cols), time_unit='d'))
11111122

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
date,dtime,time
2+
1993-06-10,1993-06-10T02:04:01.122463,02:04:01.122463
3+
2147-07-18,2147-07-18T02:16:02.883684,02:16:02.883684
4+
1739-04-22,1739-04-22T13:32:08.170115,13:32:08.170115
5+
2187-12-07,2187-12-07T15:41:41.567238,15:41:41.567238
6+
2278-04-20,2278-04-20T12:41:18.331215,12:41:18.331215
7+
2181-09-03,2181-09-03T16:20:48.795826,16:20:48.795826
8+
1992-01-21,1992-01-21T04:27:25.154369,04:27:25.154369
9+
1829-09-11,1829-09-11T10:47:37.282617,10:47:37.282618
10+
2294-10-29,2294-10-29T14:41:18.574982,14:41:18.574982
11+
1920-04-17,1920-04-17T16:31:07.566722,16:31:07.566723
12+
1868-07-28,1868-07-28T06:16:21.620745,06:16:21.620745
13+
2208-03-23,2208-03-23T05:13:20.026692,05:13:20.026692
14+
1929-05-15,1929-05-15T18:58:37.253180,18:58:37.253180
15+
1998-08-26,1998-08-26T02:03:58.743517,02:03:58.743517
16+
2122-11-24,2122-11-24T23:01:29.367663,23:01:29.367663
17+
1854-08-25,1854-08-25T01:56:49.077793,01:56:49.077793
18+
1936-01-09,1936-01-09T15:41:42.922773,15:41:42.922773
19+
1967-11-09,1967-11-09T15:23:02.071943,15:23:02.071943
20+
1703-11-02,1703-11-02T14:27:03.782796,14:27:03.782796
21+
1750-11-28,1750-11-28T08:52:49.219013,08:52:49.219013
22+
1734-07-09,1734-07-09T23:47:14.951282,23:47:14.951282
23+
2157-07-31,2157-07-31T21:52:15.157284,21:52:15.157284
24+
1950-01-07,1950-01-07T14:53:48.730921,14:53:48.730921
25+
1930-05-11,1930-05-11T03:24:48.220010,03:24:48.220011
26+
1772-06-29,1772-06-29T11:31:57.032263,11:31:57.032263
27+
2207-11-02,2207-11-02T16:23:11.818488,16:23:11.818488
28+
2275-05-17,2275-05-17T19:13:17.300388,19:13:17.300388
29+
1720-01-04,1720-01-04T17:34:54.322509,17:34:54.322509
30+
2273-01-24,2273-01-24T14:14:38.193558,14:14:38.193558
31+
1834-11-01,1834-11-01T06:04:17.738995,06:04:17.738995
32+
2266-10-02,2266-10-02T10:05:04.330017,10:05:04.330017
33+
1983-11-26,1983-11-26T12:30:57.127726,12:30:57.127726
34+
1967-11-22,1967-11-22T19:10:31.922508,19:10:31.922508
35+
1772-12-29,1772-12-29T17:17:40.509609,17:17:40.509609
36+
1920-08-05,1920-08-05T09:14:58.541695,09:14:58.541695
37+
2237-04-05,2237-04-05T11:10:54.366266,11:10:54.366266
38+
2285-04-22,2285-04-22T07:20:52.103394,07:20:52.103394
39+
1790-03-02,1790-03-02T11:20:35.978824,11:20:35.978824
40+
2222-09-28,2222-09-28T23:32:00.371672,23:32:00.371672
41+
2020-06-02,2020-06-02T11:24:03.381682,11:24:03.381682
42+
1942-09-06,1942-09-06T22:41:22.091431,22:41:22.091431
43+
1734-04-01,1734-04-01T02:06:18.583556,02:06:18.583556
44+
1710-04-28,1710-04-28T10:05:23.561800,10:05:23.561800
45+
1920-02-28,1920-02-28T14:46:28.076923,14:46:28.076923
46+
2059-01-28,2059-01-28T10:47:03.890320,10:47:03.890320
47+
1739-04-24,1739-04-24T22:59:28.010889,22:59:28.010889
48+
1911-07-10,1911-07-10T08:28:09.542829,08:28:09.542830
49+
2044-05-13,2044-05-13T15:42:33.791598,15:42:33.791598
50+
2281-02-02,2281-02-02T05:26:26.257008,05:26:26.257008
51+
1968-03-27,1968-03-27T22:34:47.154648,22:34:47.154648
52+
2237-11-09,2237-11-09T06:38:59.895151,06:38:59.895151
53+
1891-11-11,1891-11-11T03:51:21.527200,03:51:21.527201
54+
1769-05-08,1769-05-08T22:58:51.374555,22:58:51.374555
55+
2011-02-23,2011-02-23T20:37:24.734829,20:37:24.734829
56+
2211-07-10,2211-07-10T05:06:31.123780,05:06:31.123780
57+
2033-03-22,2033-03-22T06:28:12.726483,06:28:12.726483
58+
1824-10-31,1824-10-31T20:41:51.630083,20:41:51.630083
59+
1742-04-02,1742-04-02T22:15:15.392728,22:15:15.392728
60+
2286-01-29,2286-01-29T13:07:02.468262,13:07:02.468262
61+
1884-03-12,1884-03-12T07:45:14.166535,07:45:14.166534
62+
2096-04-26,2096-04-26T22:00:08.409092,22:00:08.409092
63+
1923-12-02,1923-12-02T13:15:39.641922,13:15:39.641922
64+
1723-06-12,1723-06-12T16:43:33.592489,16:43:33.592489
65+
2134-11-18,2134-11-18T06:18:01.839232,06:18:01.839232
66+
2269-02-20,2269-02-20T00:25:17.618843,00:25:17.618843
67+
1854-04-22,1854-04-22T05:25:48.059167,05:25:48.059166
68+
1968-07-13,1968-07-13T22:57:57.493756,22:57:57.493756
69+
1977-01-08,1977-01-08T09:30:29.495407,09:30:29.495407
70+
2010-12-19,2010-12-19T11:18:26.461924,11:18:26.461924
71+
2163-09-13,2163-09-13T18:33:24.278122,18:33:24.278122
72+
1730-01-15,1730-01-15T21:39:25.275543,21:39:25.275543
73+
1978-09-24,1978-09-24T23:11:39.162304,23:11:39.162304
74+
2224-02-08,2224-02-08T15:45:01.422703,15:45:01.422703
75+
1787-02-04,1787-02-04T11:01:01.320380,11:01:01.320380
76+
1964-01-18,1964-01-18T06:23:14.746125,06:23:14.746125
77+
1788-12-09,1788-12-09T09:30:16.346816,09:30:16.346816
78+
2290-09-07,2290-09-07T01:57:16.982105,01:57:16.982105
79+
2035-02-14,2035-02-14T23:34:56.107008,23:34:56.107008
80+
1905-02-18,1905-02-18T19:29:49.899169,19:29:49.899170
81+
2219-10-01,2219-10-01T20:28:45.250220,20:28:45.250220
82+
1781-08-02,1781-08-02T22:07:25.500849,22:07:25.500849
83+
1820-07-20,1820-07-20T13:07:18.717742,13:07:18.717742
84+
2171-03-31,2171-03-31T13:50:34.930294,13:50:34.930294
85+
2148-06-30,2148-06-30T07:32:48.692223,07:32:48.692223
86+
2093-03-04,2093-03-04T14:22:35.691149,14:22:35.691149
87+
2219-01-17,2219-01-17T20:32:47.025956,20:32:47.025956
88+
2032-06-27,2032-06-27T02:51:01.604809,02:51:01.604809
89+
1894-04-03,1894-04-03T00:15:54.122685,00:15:54.122684
90+
1960-02-06,1960-02-06T14:44:53.032016,14:44:53.032016
91+
1955-10-09,1955-10-09T18:14:44.323325,18:14:44.323324
92+
2195-06-21,2195-06-21T22:32:46.631441,22:32:46.631441
93+
1919-12-11,1919-12-11T19:11:02.762172,19:11:02.762173
94+
1741-05-21,1741-05-21T18:16:05.399772,18:16:05.399772
95+
1971-10-11,1971-10-11T00:14:23.048366,00:14:23.048366
96+
1810-04-22,1810-04-22T13:43:24.608671,13:43:24.608671
97+
2198-05-30,2198-05-30T12:31:55.632376,12:31:55.632376
98+
1774-04-27,1774-04-27T07:31:02.189986,07:31:02.189986
99+
2205-05-18,2205-05-18T10:39:57.032547,10:39:57.032547
100+
1875-08-15,1875-08-15T16:15:21.807336,16:15:21.807335
101+
2074-07-21,2074-07-21T08:35:02.723811,08:35:02.723811
128 KB
Binary file not shown.

tests/test_narwhalified.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,22 @@ def _prepare_data(self):
171171
self.df_sas_dates = df_dates2.to_native()
172172
#schema = {"date": nw.Date, "dtime": nw.Datetime("ns"), "time": nw.Time()}
173173
self.df_sas_dates2 = nw.concat([df_dates2, nw.from_dict({"date":[None], "dtime":[None], "time":[None]}, backend=backend)]).to_native() #, schema=schema
174+
175+
# datetime and time variables with fractional seconds as well as unusual date, time and datetime formats
176+
sas_fractional_seconds = os.path.join(self.basic_data_folder, "fractional_seconds.csv")
177+
if backend == "polars":
178+
kwds["try_parse_dates"] = True
179+
df_fractional_seconds_raw = nw.read_csv(sas_fractional_seconds,backend=backend, **kwds)
180+
df_fractional_seconds1 = df_fractional_seconds_raw.clone()
181+
df_fractional_seconds1 = df_fractional_seconds1.to_native()
182+
if backend == "pandas":
183+
df_fractional_seconds1["date"] = pd.to_datetime(df_fractional_seconds1["date"])
184+
df_fractional_seconds1["date"] = df_fractional_seconds1["date"].apply(lambda x: x.date())
185+
df_fractional_seconds1["dtime"] = pd.to_datetime(df_fractional_seconds1["dtime"])
186+
df_fractional_seconds1["time"] = pd.to_datetime(df_fractional_seconds1["time"], format='%H:%M:%S.%f')
187+
df_fractional_seconds1["time"] = df_fractional_seconds1["time"].apply(lambda x: x.time())
188+
self.df_sas_fractional_seconds = df_fractional_seconds1
189+
174190
# character column with nan and object column with nan (object pyreadstat writer doesn't know what to do with)
175191
if backend == "pandas":
176192
self.df_charnan = pd.DataFrame([[0,np.nan,np.nan],[1,"test", timedelta]], columns = ["integer", "string", "object"])
@@ -574,7 +590,11 @@ def test_sas_dates_as_pandas(self):
574590
sas_file = os.path.join(self.basic_data_folder, "dates.sas7bdat")
575591
df_sas, meta = pyreadstat.read_sas7bdat(sas_file, dates_as_pandas_datetime=True, output_format=self.backend)
576592
self.assertTrue(df_sas.equals(self.df_sas_dates_as_pandas))
577-
593+
594+
def test_sas_fractional_seconds(self):
595+
sas_file = os.path.join(self.basic_data_folder, "fractional_seconds.sas7bdat")
596+
df_sas, meta = pyreadstat.read_sas7bdat(sas_file, output_format=self.backend)
597+
self.assertTrue(df_sas.equals(self.df_sas_fractional_seconds))
578598

579599

580600
def test_sas_user_missing(self):

0 commit comments

Comments
 (0)