The data type is not explicitely taken into account in the current JSON interface.
This NoteBook uses examples to present some key points
(active link on jupyter Notebook or Nbviewer)
This Notebook can also be viewed at nbviewer
import math
from pprint import pprint
from io import StringIO
import pandas as pd
from shapely.geometry import Point
from datetime import date, datetime, time
df = pd.read_json(StringIO('{"test integer":[1,2,3], "test string": ["a", "b", "c"]}'))
print(df)
# but it is impossible with to_json() to recreate the initial data
test integer test string 0 1 a 1 2 b 2 3 c
df = pd.DataFrame(pd.Series([10,20], name='test int32', dtype='Int32'))
# dtype is not included in usual json interface
df.to_json()
'{"test int32":{"0":10,"1":20}}'
# 'int32' is lost in json-table interface
df2 = pd.read_json(StringIO(df.to_json(orient='table')), orient='table')
print(df2.dtypes)
print('\nis Json translation reversible ? ', df.equals(df2))
test integer int64 test string object dtype: object is Json translation reversible ? True
df = pd.DataFrame(pd.Series([10,20], name='test float64', dtype='float64'))
print(df.dtypes, '\n')
df2 = pd.read_json(StringIO(df.to_json(orient='records')), orient='records')
print(df2.dtypes)
print('\nis Json translation reversible ? ', df.equals(df2))
test float64 float64 dtype: object test float64 int64 dtype: object is Json translation reversible ? False
sr = pd.Series([math.nan,math.nan], name='nan')
print(sr.dtype, '\n')
sr2 = pd.read_json(StringIO(sr.to_json()), typ='series')
print(sr2)
print('\nis Json translation reversible ? ', sr.equals(sr2))
float64 0 NaT 1 NaT dtype: datetime64[ns] is Json translation reversible ? False
dfd = pd.DataFrame({'test dates': [date(2021, 3, 1), date(2021, 3, 3)]})
print(dfd.to_json(default_handler=date.isoformat), '\n')
print(dfd.to_json(orient='table'), '\n')
dfd2 = pd.read_json(StringIO(dfd.to_json(orient='table')), orient='table')
print(dfd2)
print('\nis Json translation reversible ? ', dfd.equals(dfd2))
{"test dates":{"0":1614556800000,"1":1614729600000}} {"schema":{"fields":[{"name":"index","type":"integer"},{"name":"test dates","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"test dates":"2021-03-01T00:00:00.000"},{"index":1,"test dates":"2021-03-03T00:00:00.000"}]} test dates 0 2021-03-01T00:00:00.000 1 2021-03-03T00:00:00.000 is Json translation reversible ? False
dfd = pd.DataFrame({'test tuple': [(2021, 3, 1), (2021, 3, 3)]})
print(dfd, '\n')
print(dfd.to_json(), '\n')
print(dfd.to_json(orient='table'), '\n')
dfd2 = pd.read_json(StringIO(dfd.to_json(orient='table')), orient='table')
print(dfd2)
print('\nis Json translation reversible ? ', dfd.equals(dfd2))
test tuple 0 (2021, 3, 1) 1 (2021, 3, 3) {"test tuple":{"0":[2021,3,1],"1":[2021,3,3]}} {"schema":{"fields":[{"name":"index","type":"integer"},{"name":"test tuple","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"test tuple":[2021,3,1]},{"index":1,"test tuple":[2021,3,3]}]} test tuple 0 [2021, 3, 1] 1 [2021, 3, 3] is Json translation reversible ? False
df = pd.DataFrame(pd.Series([10,20], name='test float', dtype='float32'), dtype='category')
print(df.to_json(orient='table'))
{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"test float","type":"any","constraints":{"enum":[10.0,20.0]},"ordered":false}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"test float":10.0},{"index":1,"test float":20.0}]}
to_csv
and to_json
methods have different behaviorsdf = pd.DataFrame({'test dates' : [date(2021, 10, 2), date(2021, 10, 4)],
'test times' : [time(10, 10, 2), time(11, 10, 4)]})
print('CSV data :\n', df.to_csv())
print('JSON data :\n', df.to_json())
print('JSON data :\n', df.to_json(date_format='iso'))
CSV data : ,test dates,test times 0,2021-10-02,10:10:02 1,2021-10-04,11:10:04 JSON data : {"test dates":{"0":1633132800000,"1":1633305600000},"test times":{"0":"10:10:02","1":"11:10:04"}} JSON data : {"test dates":{"0":"2021-10-02T00:00:00.000","1":"2021-10-04T00:00:00.000"},"test times":{"0":"10:10:02","1":"11:10:04"}}
import ntv_pandas as npd
tab_data = {'dates': ['1964-01-01', '1985-02-05', '2022-01-21', '1964-01-01', '1985-02-05', '2022-01-21'],
'value': [10, 10, 20, 20, 30, 30],
'names': ['john', 'eric', 'judith', 'mila', 'hector', 'maria'],
'unique': [True, True, True, True, True, True] }
df = pd.DataFrame(tab_data, dtype='category')
print(df, '\n')
# length with compact interface : 240
print(npd.to_json(df, text=True))
print(len(npd.to_json(df, text=True)), '\n')
# length with actual interface : 946
print(df.to_json(orient='table'))
print(len(df.to_json(orient='table')), '\n')
dates value names unique 0 1964-01-01 10 john True 1 1985-02-05 10 eric True 2 2022-01-21 20 judith True 3 1964-01-01 20 mila True 4 1985-02-05 30 hector True 5 2022-01-21 30 maria True {':tab': {'index': [0, 1, 2, 3, 4, 5], 'dates': [['1964-01-01', '1985-02-05', '2022-01-21'], [1]], 'value': [[10, 20, 30], [2]], 'names': [['eric', 'hector', 'john', 'judith', 'maria', 'mila'], [2, 0, 3, 5, 1, 4]], 'unique': [[True], [6]]}} 1 {"schema":{"fields":[{"name":"index","type":"integer"},{"name":"dates","type":"any","constraints":{"enum":["1964-01-01","1985-02-05","2022-01-21"]},"ordered":false},{"name":"value","type":"any","constraints":{"enum":[10,20,30]},"ordered":false},{"name":"names","type":"any","constraints":{"enum":["eric","hector","john","judith","maria","mila"]},"ordered":false},{"name":"unique","type":"boolean","constraints":{"enum":[true]},"ordered":false}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"dates":"1964-01-01","value":10,"names":"john","unique":true},{"index":1,"dates":"1985-02-05","value":10,"names":"eric","unique":true},{"index":2,"dates":"2022-01-21","value":20,"names":"judith","unique":true},{"index":3,"dates":"1964-01-01","value":20,"names":"mila","unique":true},{"index":4,"dates":"1985-02-05","value":30,"names":"hector","unique":true},{"index":5,"dates":"2022-01-21","value":30,"names":"maria","unique":true}]} 946
constraints
or extDtype
members are added into table-schema headerpandas dtype |
pandas dtype reverse |
pandas Reversible (DataFrame are equal and dtype are equal) |
table-schema type |
---|---|---|---|
datetime64[ns, |
datetime64[ns, |
yes | datetime 'tz': |
datetime64[ns] | datetime64[ns] | yes | datetime |
int64 | int64 | yes | integer |
float64 | float64 | yes | number |
bool | bool | yes | boolean |
string | string | yes | any 'extDtype': 'string' |
category | category | depends on the dtype inside category | any 'enum': [xxxx] |
object | object | depends on value type | string |
Sparse[<dtype, fill>] | no | type (from dtype) 'extDtype': 'Sparse[<dtype, fill>]' |
|
int | int64 | no | integer |
Intxx, UIntxx | int64 | no | integer 'extDtype':dtype |
float | float64 | no | number |
float32 | float64 | no | number |
Float64 | float64 | no | number 'extDtype':dtype |
boolean | bool | no | boolean 'extDtype': 'boolean' |
period[ |
Read-json not available | no | datetime 'freq': 'M' |
timedelta64[ns] | Read_json not yet implemented | no | duration |
interval | not available | no | string |
reversibility :
df_reverse = pd.read_json(df.to_json(orient='table'), orient='table')
Data Data type |
table-schema format |
table-schema type |
pandas Specification orient=’table’ |
pandas read_json orient=’table’ |
---|---|---|---|---|
datetime | default (datetime ISO8601 in UTC) | datetime | datetime64[ns] | ok |
number | default | number | float64 | ok |
integer | default | integer | int64 | ok |
boolean | default | boolean | bool | ok |
string | default | string | object | ok |
custom type | default | any (custom type) | category / string | ok |
string | Format not supported | |||
uri | uri | string | Format not supported | |
binary | Binary (base64 string) | string | Format not supported | |
uuid | uuid | string | Format not supported | |
date, time or datetime with parsable format | any (parsable ?) | date, time, datetime | Format partially supported | |
date, time or datetime with custom format | <PATTERN> | date, time, datetime | Format not supported | |
duration | default (lexical duration ISO8601) | duration | timedelta64[ns] | Read_json not yet implemented |
Json data | default (json) | object | Unsupported | |
Json array | default (json array) | array | Unsupported | |
date | default (date ISO8601) | date | Unsupported | |
time | default (time ISO8601) | time | Unsupported | |
year | default | year | Unsupported | |
month | default | yearmonth | Unsupported | |
Point (string) | default (string “lon, lat”) | geopoint | Unsupported | |
Point (geojson array) | array (array [lon, lat]) | geopoint | Unsupported | |
Point (json object) | object (eg {"lon": 90, "lat": 45}) | geopoint | Unsupported | |
Geometry (geojson) | default (geojson spec) | geojson | Unsupported | |
Geometry (topojson) | Topojson (topojson spec) | geojson | Unsupported | |
Everything (custom type) | Only ‘any‘ is supported |
import json
data = [[{'test': [1,2,3]}, {'dtype': 'int32'}],
[{'test': [1,2,3]}, {'dtype': 'int'}],
[{'test': [1,2,3]}, {'dtype': 'int64'}],
[{'test': [1,2,3]}, {'dtype': 'Int64'}],
[{'test': [1,2,3]}, {'dtype': 'UInt64'}],
[{'test': [1,2,3]}, {'dtype': 'float'}],
[{'test': [1,2,3]}, {'dtype': 'Float32'}],
[{'test': [1,2,3]}, {'dtype': 'float32'}],
[{'test': [1,2,3]}, {'dtype': 'float64'}],
[{'test': [1,2,3]}, {'dtype': 'Sparse[float64]'}],
# [{'test': [pd.Interval(1,2), pd.Interval(2,3), pd.Interval(3,4)]}], # read_json ko
[{'test': ['2020-01-01']}, {'dtype': 'datetime64[ns]'}],
[{'test': ['2020-01-01']}, {'dtype': 'datetime64[ns, UTC]'}],
[{'test': ['2020-01-01']}, {'dtype': 'category'}],
[{'test': [datetime(2020, 1, 1)]}, {'dtype': 'category'}],
[{'test': [True, False]}, {'dtype': 'boolean'}],
[{'test': [True, False]}, {'dtype': 'bool'}],
# [{'test': ['1 days', '2 days']}, {'dtype': 'timedelta64[ns]'}], # read_json not yet implemented
# [{'test': ['2020-01-01', '2020-02-01', '2020-03-01']}, {'dtype': 'period[M]'}], # read_json not available
[{'test': [True, 1, 'er', datetime(2020, 1, 1)]}, {'dtype': 'object'}],
]
print('reverse (True, False), json table-schema object, reverse dtype:\n')
for df_data in data:
df = pd.DataFrame(df_data[0], **df_data[1]) if len(df_data) == 2 else pd.DataFrame(df_data[0])
js = df.to_json(orient='table')
df2 = pd.read_json(StringIO(js), orient='table')
print(df.equals(df2), json.loads(js)['schema']['fields'][1], df2.dtypes.iloc[0])
reverse (True, False), json table-schema object, reverse dtype: False {'name': 'test', 'type': 'integer'} int64 False {'name': 'test', 'type': 'integer'} int64 True {'name': 'test', 'type': 'integer'} int64 True {'name': 'test', 'type': 'integer', 'extDtype': 'Int64'} Int64 True {'name': 'test', 'type': 'integer', 'extDtype': 'UInt64'} UInt64 True {'name': 'test', 'type': 'number'} float64 True {'name': 'test', 'type': 'number', 'extDtype': 'Float32'} Float32 False {'name': 'test', 'type': 'number'} float64 True {'name': 'test', 'type': 'number'} float64 True {'name': 'test', 'type': 'number', 'extDtype': 'Sparse[float64, nan]'} Sparse[float64, nan] True {'name': 'test', 'type': 'datetime'} datetime64[ns] True {'name': 'test', 'type': 'datetime', 'tz': 'UTC'} datetime64[ns, UTC] True {'name': 'test', 'type': 'any', 'constraints': {'enum': ['2020-01-01']}, 'ordered': False} category False {'name': 'test', 'type': 'any', 'constraints': {'enum': ['2020-01-01T00:00:00.000']}, 'ordered': False} category True {'name': 'test', 'type': 'boolean', 'extDtype': 'boolean'} boolean True {'name': 'test', 'type': 'boolean'} bool False {'name': 'test', 'type': 'string'} object