PyArrow Functionality
pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:
More extensive data types compared to NumPy
Missing data support (NA) for all data types
Performant IO reader integration
Interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)
To use this functionality, please ensure you have installed the minimum supported PyArrow version.
Data Structure Integration
A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray, which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by [pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.
In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[1], line 1
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
425 index = ensure_index(index)
427 if dtype is not None:
--> 428 dtype = self._validate_dtype(dtype)
430 if data is None:
431 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
456 """validate the passed dtype"""
457 if dtype is not None:
--> 458 dtype = pandas_dtype(dtype)
460 # a compound dtype
461 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
216 base_type = string[:-9] # get rid of "[pyarrow]"
217 try:
--> 218 pa_dtype = pa.type_for_alias(base_type)
219 except ValueError as err:
220 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [2]: ser
Out[2]:
0 0.0
1 1.0
2 2.0
3 3.0
4 4.0
dtype: float64
In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:482, in Index.__new__(cls, data, dtype, copy, name, tupleize_cols)
479 name = maybe_extract_name(name, data, cls)
481 if dtype is not None:
--> 482 dtype = pandas_dtype(dtype)
484 data_dtype = getattr(data, "dtype", None)
486 refs = None
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
216 base_type = string[:-9] # get rid of "[pyarrow]"
217 try:
--> 218 pa_dtype = pa.type_for_alias(base_type)
219 except ValueError as err:
220 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [4]: idx
Out[4]: [0.0, 1.0, 10.0]
In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/frame.py:650, in DataFrame.__init__(self, data, index, columns, dtype, copy)
641 def __init__(
642 self,
643 data=None,
(...)
647 copy: bool | None = None,
648 ) -> None:
649 if dtype is not None:
--> 650 dtype = self._validate_dtype(dtype)
652 if isinstance(data, DataFrame):
653 data = data._mgr
File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
456 """validate the passed dtype"""
457 if dtype is not None:
--> 458 dtype = pandas_dtype(dtype)
460 # a compound dtype
461 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
216 base_type = string[:-9] # get rid of "[pyarrow]"
217 try:
--> 218 pa_dtype = pa.type_for_alias(base_type)
219 except ValueError as err:
220 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [6]: df
Out[6]:
a b
0 xxx yyy
1 ¡¡ ¡¡
Note
The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow"), which is not equivalent to specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly, except that pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string()) will return ArrowDtype.
In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [8]: data = list("abc")
In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[9], line 1
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
425 index = ensure_index(index)
427 if dtype is not None:
--> 428 dtype = self._validate_dtype(dtype)
430 if data is None:
431 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
456 """validate the passed dtype"""
457 if dtype is not None:
--> 458 dtype = pandas_dtype(dtype)
460 # a compound dtype
461 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:163, in StringDtype.construct_from_string(cls, string)
161 return cls(storage="python")
162 elif string == "string[pyarrow]":
--> 163 return cls(storage="pyarrow")
164 else:
165 raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:117, in StringDtype.__init__(self, storage)
113 raise ValueError(
114 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
115 )
116 if storage == "pyarrow" and pa_version_under7p0:
--> 117 raise ImportError(
118 "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
119 )
120 self.storage = storage
ImportError: pyarrow>=7.0.0 is required for PyArrow backed StringArray.
In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
NameError: name 'pa' is not defined
In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 1
----> 1 ser_ad.dtype == ser_sd.dtype
NameError: name 'ser_ad' is not defined
In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 1
----> 1 ser_sd.str.contains("a")
NameError: name 'ser_sd' is not defined
In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 1
----> 1 ser_ad.str.contains("a")
NameError: name 'ser_ad' is not defined
For PyArrow types that accept parameters, you can pass a PyArrow type with those parameters into ArrowDtype to use in the dtype parameter.
In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[14], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 1
----> 1 list_str_type = pa.list_(pa.string())
NameError: name 'pa' is not defined
In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 1
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
NameError: name 'list_str_type' is not defined
In [17]: ser
Out[17]:
0 0.0
1 1.0
2 2.0
3 3.0
4 4.0
dtype: float64
In [18]: from datetime import time
In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 1
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
NameError: name 'pa' is not defined
In [20]: idx
Out[20]: [0.0, 1.0, 10.0]
In [21]: from decimal import Decimal
In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 1
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
NameError: name 'pa' is not defined
In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]
In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 1
----> 1 df = pd.DataFrame(data, dtype=decimal_type)
NameError: name 'decimal_type' is not defined
In [25]: df
Out[25]:
a b
0 xxx yyy
1 ¡¡ ¡¡
If you already have a pyarrow.Array or pyarrow.ChunkedArray, you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index or DataFrame object.
In [26]: pa_array = pa.array(
....: [{"1": "2"}, {"10": "20"}, None],
....: type=pa.map_(pa.string(), pa.string()),
....: )
....:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[26], line 1
----> 1 pa_array = pa.array(
2 [{"1": "2"}, {"10": "20"}, None],
3 type=pa.map_(pa.string(), pa.string()),
4 )
NameError: name 'pa' is not defined
In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 1
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
NameError: name 'pa_array' is not defined
In [28]: ser
Out[28]:
0 0.0
1 1.0
2 2.0
3 3.0
4 4.0
dtype: float64
To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call the pyarrow array constructor on the Series or Index.
In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 1
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
425 index = ensure_index(index)
427 if dtype is not None:
--> 428 dtype = self._validate_dtype(dtype)
430 if data is None:
431 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
456 """validate the passed dtype"""
457 if dtype is not None:
--> 458 dtype = pandas_dtype(dtype)
460 # a compound dtype
461 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
216 base_type = string[:-9] # get rid of "[pyarrow]"
217 try:
--> 218 pa_dtype = pa.type_for_alias(base_type)
219 except ValueError as err:
220 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[30], line 1
----> 1 pa.array(ser)
NameError: name 'pa' is not defined
In [31]: idx = pd.Index(ser)
In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[32], line 1
----> 1 pa.array(idx)
NameError: name 'pa' is not defined
To convert a pyarrow.Table to a DataFrame, you can call the pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.
In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[33], line 1
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
NameError: name 'pa' is not defined
In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-34-64ec62289cb4> in ?()
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)
/usr/lib/python3/dist-packages/pandas/core/generic.py in ?(self, name)
5985 and name not in self._accessors
5986 and self._info_axis._can_hold_identifiers_and_holds_name(name)
5987 ):
5988 return self[name]
-> 5989 return object.__getattribute__(self, name)
AttributeError: 'DataFrame' object has no attribute 'to_pandas'
In [35]: df
Out[35]:
a b
0 xxx yyy
1 ¡¡ ¡¡
In [36]: df.dtypes
Out[36]:
a object
b object
dtype: object
Operations
PyArrow data structure integration is implemented through pandas’ ExtensionArray
interface;
therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality
is accelerated with PyArrow compute functions where available. This includes:
Numeric aggregations
Numeric arithmetic
Numeric rounding
Logical and comparison functions
String functionality
Datetime functionality
The following are just some examples of operations that are accelerated by native PyArrow compute functions.
In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[37], line 1
----> 1 import pyarrow as pa
ModuleNotFoundError: No module named 'pyarrow'
In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[38], line 1
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
425 index = ensure_index(index)
427 if dtype is not None:
--> 428 dtype = self._validate_dtype(dtype)
430 if data is None:
431 index = index if index is not None else default_index(0)
File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
456 """validate the passed dtype"""
457 if dtype is not None:
--> 458 dtype = pandas_dtype(dtype)
460 # a compound dtype
461 if dtype.kind == "V":
File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
1676 return dtype
1678 # registered extension types
-> 1679 result = registry.find(dtype)
1680 if result is not None:
1681 return result
File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
519 for dtype_type in self.dtypes:
520 try:
--> 521 return dtype_type.construct_from_string(dtype)
522 except TypeError:
523 pass
File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
216 base_type = string[:-9] # get rid of "[pyarrow]"
217 try:
--> 218 pa_dtype = pa.type_for_alias(base_type)
219 except ValueError as err:
220 has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
NameError: name 'pa' is not defined
In [39]: ser.mean()
Out[39]: 2.0
In [40]: ser + ser
Out[40]:
0 0.0
1 2.0
2 4.0
3 6.0
4 8.0
dtype: float64
In [41]: ser > (ser + 1)
Out[41]:
0 False
1 False
2 False
3 False
4 False
dtype: bool
In [42]: ser.dropna()
Out[42]:
0 0.0
1 1.0
2 2.0
3 3.0
4 4.0
dtype: float64
In [43]: ser.isna()
Out[43]:
0 False
1 False
2 False
3 False
4 False
dtype: bool
In [44]: ser.fillna(0)
Out[44]:
0 0.0
1 1.0
2 2.0
3 3.0
4 4.0
dtype: float64
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[45], line 1
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
NameError: name 'pa' is not defined
In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[46], line 1
----> 1 ser_str.str.startswith("a")
NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime
In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[48], line 1
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))
NameError: name 'pa' is not defined
In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[49], line 1
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
NameError: name 'pa_type' is not defined
In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[50], line 1
----> 1 ser_dt.dt.strftime("%Y-%m")
NameError: name 'ser_dt' is not defined
I/O Reading
PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. The following functions provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source: read_csv(), read_json(), read_orc(), and read_feather().
In [51]: import io
In [52]: data = io.StringIO("""a,b,c
....: 1,2.5,True
....: 3,4.5,False
....: """)
....:
In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:147, in import_optional_dependency(name, extra, errors, min_version)
146 try:
--> 147 module = importlib.import_module(name)
148 except ImportError:
File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1126, in _find_and_load_unlocked(name, import_)
File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'pyarrow'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Cell In[53], line 1
----> 1 df = pd.read_csv(data, engine="pyarrow")
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:912, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
899 kwds_defaults = _refine_defaults_read(
900 dialect,
901 delimiter,
(...)
908 dtype_backend=dtype_backend,
909 )
910 kwds.update(kwds_defaults)
--> 912 return _read(filepath_or_buffer, kwds)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:583, in _read(filepath_or_buffer, kwds)
580 return parser
582 with parser:
--> 583 return parser.read(nrows)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1692, in TextFileReader.read(self, nrows)
1689 if self.engine == "pyarrow":
1690 try:
1691 # error: "ParserBase" has no attribute "read"
-> 1692 df = self._engine.read() # type: ignore[attr-defined]
1693 except Exception:
1694 self.close()
File /usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py:149, in ArrowParserWrapper.read(self)
138 def read(self) -> DataFrame:
139 """
140 Reads the contents of a CSV file into a DataFrame and
141 processes it according to the kwargs passed in the
(...)
147 The DataFrame created from the CSV file.
148 """
--> 149 pyarrow_csv = import_optional_dependency("pyarrow.csv")
150 self._get_pyarrow_options()
152 table = pyarrow_csv.read_csv(
153 self.src,
154 read_options=pyarrow_csv.ReadOptions(**self.read_options),
155 parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
156 convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
157 )
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:150, in import_optional_dependency(name, extra, errors, min_version)
148 except ImportError:
149 if errors == "raise":
--> 150 raise ImportError(msg)
151 return None
153 # Handle submodules: if we have submodule, grab parent module from sys.modules
ImportError: Missing optional dependency 'pyarrow.csv'. Use pip or conda to install pyarrow.csv.
In [54]: df
Out[54]:
a b
0 xxx yyy
1 ¡¡ ¡¡
By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not necessarily need to set engine="pyarrow" to return PyArrow-backed data.
In [55]: import io
In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
....: 1,2.5,True,a,,,,,
....: 3,4.5,False,b,6,7.5,True,a,
....: """)
....:
In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:147, in import_optional_dependency(name, extra, errors, min_version)
146 try:
--> 147 module = importlib.import_module(name)
148 except ImportError:
File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
125 level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'pyarrow'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Cell In[57], line 1
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:912, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
899 kwds_defaults = _refine_defaults_read(
900 dialect,
901 delimiter,
(...)
908 dtype_backend=dtype_backend,
909 )
910 kwds.update(kwds_defaults)
--> 912 return _read(filepath_or_buffer, kwds)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:577, in _read(filepath_or_buffer, kwds)
574 _validate_names(kwds.get("names", None))
576 # Create the parser.
--> 577 parser = TextFileReader(filepath_or_buffer, **kwds)
579 if chunksize or iterator:
580 return parser
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1407, in TextFileReader.__init__(self, f, engine, **kwds)
1404 self.options["has_index_names"] = kwds["has_index_names"]
1406 self.handles: IOHandles | None = None
-> 1407 self._engine = self._make_engine(f, self.engine)
File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1679, in TextFileReader._make_engine(self, f, engine)
1676 raise ValueError(msg)
1678 try:
-> 1679 return mapping[engine](f, **self.options)
1680 except Exception:
1681 if self.handles is not None:
File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:92, in CParserWrapper.__init__(self, src, **kwds)
89 kwds["dtype_backend"] = "numpy"
90 if kwds["dtype_backend"] == "pyarrow":
91 # Fail here loudly instead of in cython after reading
---> 92 import_optional_dependency("pyarrow")
93 self._reader = parsers.TextReader(src, **kwds)
95 self.unnamed_cols = self._reader.unnamed_cols
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:150, in import_optional_dependency(name, extra, errors, min_version)
148 except ImportError:
149 if errors == "raise":
--> 150 raise ImportError(msg)
151 return None
153 # Handle submodules: if we have submodule, grab parent module from sys.modules
ImportError: Missing optional dependency 'pyarrow'. Use pip or conda to install pyarrow.
In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[58], line 1
----> 1 df_pyarrow.dtypes
NameError: name 'df_pyarrow' is not defined
Several non-IO reader functions can also use the dtype_backend
argument to return PyArrow-backed data including: