PyArrow Functionality

pandas can utilize PyArrow to extend functionality and improve the performance of various APIs. This includes:

  • More extensive data types compared to NumPy

  • Missing data support (NA) for all data types

  • Performant IO reader integration

  • Facilitate interoperability with other dataframe libraries based on the Apache Arrow specification (e.g. polars, cuDF)

To use this functionality, please ensure you have installed the minimum supported PyArrow version.

Data Structure Integration

A Series, Index, or the columns of a DataFrame can be directly backed by a pyarrow.ChunkedArray, which is similar to a NumPy array. To construct these from the main pandas data structures, you can pass in a string of the type followed by [pyarrow], e.g. "int64[pyarrow]", into the dtype parameter.

In [1]: ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 ser = pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    425     index = ensure_index(index)
    427 if dtype is not None:
--> 428     dtype = self._validate_dtype(dtype)
    430 if data is None:
    431     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
    456 """validate the passed dtype"""
    457 if dtype is not None:
--> 458     dtype = pandas_dtype(dtype)
    460     # a compound dtype
    461     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
    216 base_type = string[:-9]  # get rid of "[pyarrow]"
    217 try:
--> 218     pa_dtype = pa.type_for_alias(base_type)
    219 except ValueError as err:
    220     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [2]: ser
Out[2]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [3]: idx = pd.Index([True, None], dtype="bool[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 idx = pd.Index([True, None], dtype="bool[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/indexes/base.py:482, in Index.__new__(cls, data, dtype, copy, name, tupleize_cols)
    479 name = maybe_extract_name(name, data, cls)
    481 if dtype is not None:
--> 482     dtype = pandas_dtype(dtype)
    484 data_dtype = getattr(data, "dtype", None)
    486 refs = None

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
    216 base_type = string[:-9]  # get rid of "[pyarrow]"
    217 try:
--> 218     pa_dtype = pa.type_for_alias(base_type)
    219 except ValueError as err:
    220     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [4]: idx
Out[4]: [0.0, 1.0, 10.0]

In [5]: df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 df = pd.DataFrame([[1, 2], [3, 4]], dtype="uint64[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/frame.py:650, in DataFrame.__init__(self, data, index, columns, dtype, copy)
    641 def __init__(
    642     self,
    643     data=None,
   (...)
    647     copy: bool | None = None,
    648 ) -> None:
    649     if dtype is not None:
--> 650         dtype = self._validate_dtype(dtype)
    652     if isinstance(data, DataFrame):
    653         data = data._mgr

File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
    456 """validate the passed dtype"""
    457 if dtype is not None:
--> 458     dtype = pandas_dtype(dtype)
    460     # a compound dtype
    461     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
    216 base_type = string[:-9]  # get rid of "[pyarrow]"
    217 try:
--> 218     pa_dtype = pa.type_for_alias(base_type)
    219 except ValueError as err:
    220     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [6]: df
Out[6]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

Note

The string alias "string[pyarrow]" maps to pd.StringDtype("pyarrow") which is not equivalent to specifying dtype=pd.ArrowDtype(pa.string()). Generally, operations on the data will behave similarly except pd.StringDtype("pyarrow") can return NumPy-backed nullable types while pd.ArrowDtype(pa.string()) will return ArrowDtype.

In [7]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [8]: data = list("abc")

In [9]: ser_sd = pd.Series(data, dtype="string[pyarrow]")
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[9], line 1
----> 1 ser_sd = pd.Series(data, dtype="string[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    425     index = ensure_index(index)
    427 if dtype is not None:
--> 428     dtype = self._validate_dtype(dtype)
    430 if data is None:
    431     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
    456 """validate the passed dtype"""
    457 if dtype is not None:
--> 458     dtype = pandas_dtype(dtype)
    460     # a compound dtype
    461     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:163, in StringDtype.construct_from_string(cls, string)
    161     return cls(storage="python")
    162 elif string == "string[pyarrow]":
--> 163     return cls(storage="pyarrow")
    164 else:
    165     raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

File /usr/lib/python3/dist-packages/pandas/core/arrays/string_.py:117, in StringDtype.__init__(self, storage)
    113     raise ValueError(
    114         f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
    115     )
    116 if storage == "pyarrow" and pa_version_under7p0:
--> 117     raise ImportError(
    118         "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
    119     )
    120 self.storage = storage

ImportError: pyarrow>=7.0.0 is required for PyArrow backed StringArray.

In [10]: ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 ser_ad = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [11]: ser_ad.dtype == ser_sd.dtype
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 ser_ad.dtype == ser_sd.dtype

NameError: name 'ser_ad' is not defined

In [12]: ser_sd.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 1
----> 1 ser_sd.str.contains("a")

NameError: name 'ser_sd' is not defined

In [13]: ser_ad.str.contains("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[13], line 1
----> 1 ser_ad.str.contains("a")

NameError: name 'ser_ad' is not defined

For PyArrow types that accept parameters, you can pass in a PyArrow type with those parameters into ArrowDtype to use in the dtype parameter.

In [14]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[14], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [15]: list_str_type = pa.list_(pa.string())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 1
----> 1 list_str_type = pa.list_(pa.string())

NameError: name 'pa' is not defined

In [16]: ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 1
----> 1 ser = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type))

NameError: name 'list_str_type' is not defined

In [17]: ser
Out[17]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [18]: from datetime import time

In [19]: idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[19], line 1
----> 1 idx = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us")))

NameError: name 'pa' is not defined

In [20]: idx
Out[20]: [0.0, 1.0, 10.0]
In [21]: from decimal import Decimal

In [22]: decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))

NameError: name 'pa' is not defined

In [23]: data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]

In [24]: df = pd.DataFrame(data, dtype=decimal_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 1
----> 1 df = pd.DataFrame(data, dtype=decimal_type)

NameError: name 'decimal_type' is not defined

In [25]: df
Out[25]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

If you already have a pyarrow.Array or pyarrow.ChunkedArray, you can pass it into arrays.ArrowExtensionArray to construct the associated Series, Index or DataFrame object.

In [26]: pa_array = pa.array(
   ....:     [{"1": "2"}, {"10": "20"}, None],
   ....:     type=pa.map_(pa.string(), pa.string()),
   ....: )
   ....: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 1
----> 1 pa_array = pa.array(
      2     [{"1": "2"}, {"10": "20"}, None],
      3     type=pa.map_(pa.string(), pa.string()),
      4 )

NameError: name 'pa' is not defined

In [27]: ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 ser = pd.Series(pd.arrays.ArrowExtensionArray(pa_array))

NameError: name 'pa_array' is not defined

In [28]: ser
Out[28]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

To retrieve a pyarrow.ChunkedArray from a Series or Index, you can call the pyarrow array constructor (pa.array) on the Series or Index.

In [29]: ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 ser = pd.Series([1, 2, None], dtype="uint8[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    425     index = ensure_index(index)
    427 if dtype is not None:
--> 428     dtype = self._validate_dtype(dtype)
    430 if data is None:
    431     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
    456 """validate the passed dtype"""
    457 if dtype is not None:
--> 458     dtype = pandas_dtype(dtype)
    460     # a compound dtype
    461     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
    216 base_type = string[:-9]  # get rid of "[pyarrow]"
    217 try:
--> 218     pa_dtype = pa.type_for_alias(base_type)
    219 except ValueError as err:
    220     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [30]: pa.array(ser)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[30], line 1
----> 1 pa.array(ser)

NameError: name 'pa' is not defined

In [31]: idx = pd.Index(ser)

In [32]: pa.array(idx)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[32], line 1
----> 1 pa.array(idx)

NameError: name 'pa' is not defined

To convert a pyarrow.Table to a DataFrame, you can call the pyarrow.Table.to_pandas() method with types_mapper=pd.ArrowDtype.

In [33]: table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 table = pa.table([pa.array([1, 2, 3], type=pa.int64())], names=["a"])

NameError: name 'pa' is not defined

In [34]: df = table.to_pandas(types_mapper=pd.ArrowDtype)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-34-64ec62289cb4> in ?()
----> 1 df = table.to_pandas(types_mapper=pd.ArrowDtype)

/usr/lib/python3/dist-packages/pandas/core/generic.py in ?(self, name)
   5985             and name not in self._accessors
   5986             and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5987         ):
   5988             return self[name]
-> 5989         return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'to_pandas'

In [35]: df
Out[35]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

In [36]: df.dtypes
Out[36]: 
a    object
b    object
dtype: object

Operations

PyArrow data structure integration is implemented through pandas’ ExtensionArray interface; therefore, supported functionality exists where this interface is integrated within the pandas API. Additionally, this functionality is accelerated with PyArrow compute functions where available. This includes:

  • Numeric aggregations

  • Numeric arithmetic

  • Numeric rounding

  • Logical and comparison functions

  • String functionality

  • Datetime functionality

The following are just some examples of operations that are accelerated by native PyArrow compute functions.

In [37]: import pyarrow as pa
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[37], line 1
----> 1 import pyarrow as pa

ModuleNotFoundError: No module named 'pyarrow'

In [38]: ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[38], line 1
----> 1 ser = pd.Series([-1.545, 0.211, None], dtype="float32[pyarrow]")

File /usr/lib/python3/dist-packages/pandas/core/series.py:428, in Series.__init__(self, data, index, dtype, name, copy, fastpath)
    425     index = ensure_index(index)
    427 if dtype is not None:
--> 428     dtype = self._validate_dtype(dtype)
    430 if data is None:
    431     index = index if index is not None else default_index(0)

File /usr/lib/python3/dist-packages/pandas/core/generic.py:458, in NDFrame._validate_dtype(cls, dtype)
    456 """validate the passed dtype"""
    457 if dtype is not None:
--> 458     dtype = pandas_dtype(dtype)
    460     # a compound dtype
    461     if dtype.kind == "V":

File /usr/lib/python3/dist-packages/pandas/core/dtypes/common.py:1679, in pandas_dtype(dtype)
   1676     return dtype
   1678 # registered extension types
-> 1679 result = registry.find(dtype)
   1680 if result is not None:
   1681     return result

File /usr/lib/python3/dist-packages/pandas/core/dtypes/base.py:521, in Registry.find(self, dtype)
    519 for dtype_type in self.dtypes:
    520     try:
--> 521         return dtype_type.construct_from_string(dtype)
    522     except TypeError:
    523         pass

File /usr/lib/python3/dist-packages/pandas/core/arrays/arrow/dtype.py:218, in ArrowDtype.construct_from_string(cls, string)
    216 base_type = string[:-9]  # get rid of "[pyarrow]"
    217 try:
--> 218     pa_dtype = pa.type_for_alias(base_type)
    219 except ValueError as err:
    220     has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)

NameError: name 'pa' is not defined

In [39]: ser.mean()
Out[39]: 2.0

In [40]: ser + ser
Out[40]: 
0    0.0
1    2.0
2    4.0
3    6.0
4    8.0
dtype: float64

In [41]: ser > (ser + 1)
Out[41]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [42]: ser.dropna()
Out[42]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [43]: ser.isna()
Out[43]: 
0    False
1    False
2    False
3    False
4    False
dtype: bool

In [44]: ser.fillna(0)
Out[44]: 
0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64
In [45]: ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[45], line 1
----> 1 ser_str = pd.Series(["a", "b", None], dtype=pd.ArrowDtype(pa.string()))

NameError: name 'pa' is not defined

In [46]: ser_str.str.startswith("a")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[46], line 1
----> 1 ser_str.str.startswith("a")

NameError: name 'ser_str' is not defined
In [47]: from datetime import datetime

In [48]: pa_type = pd.ArrowDtype(pa.timestamp("ns"))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 pa_type = pd.ArrowDtype(pa.timestamp("ns"))

NameError: name 'pa' is not defined

In [49]: ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[49], line 1
----> 1 ser_dt = pd.Series([datetime(2022, 1, 1), None], dtype=pa_type)

NameError: name 'pa_type' is not defined

In [50]: ser_dt.dt.strftime("%Y-%m")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[50], line 1
----> 1 ser_dt.dt.strftime("%Y-%m")

NameError: name 'ser_dt' is not defined

I/O Reading

PyArrow also provides IO reading functionality that has been integrated into several pandas IO readers. Several reader functions (for example read_csv(), shown below) provide an engine keyword that can dispatch to PyArrow to accelerate reading from an IO source.

In [51]: import io

In [52]: data = io.StringIO("""a,b,c
   ....:    1,2.5,True
   ....:    3,4.5,False
   ....: """)
   ....: 

In [53]: df = pd.read_csv(data, engine="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:147, in import_optional_dependency(name, extra, errors, min_version)
    146 try:
--> 147     module = importlib.import_module(name)
    148 except ImportError:

File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1126, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[53], line 1
----> 1 df = pd.read_csv(data, engine="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:912, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    899 kwds_defaults = _refine_defaults_read(
    900     dialect,
    901     delimiter,
   (...)
    908     dtype_backend=dtype_backend,
    909 )
    910 kwds.update(kwds_defaults)
--> 912 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:583, in _read(filepath_or_buffer, kwds)
    580     return parser
    582 with parser:
--> 583     return parser.read(nrows)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1692, in TextFileReader.read(self, nrows)
   1689 if self.engine == "pyarrow":
   1690     try:
   1691         # error: "ParserBase" has no attribute "read"
-> 1692         df = self._engine.read()  # type: ignore[attr-defined]
   1693     except Exception:
   1694         self.close()

File /usr/lib/python3/dist-packages/pandas/io/parsers/arrow_parser_wrapper.py:149, in ArrowParserWrapper.read(self)
    138 def read(self) -> DataFrame:
    139     """
    140     Reads the contents of a CSV file into a DataFrame and
    141     processes it according to the kwargs passed in the
   (...)
    147         The DataFrame created from the CSV file.
    148     """
--> 149     pyarrow_csv = import_optional_dependency("pyarrow.csv")
    150     self._get_pyarrow_options()
    152     table = pyarrow_csv.read_csv(
    153         self.src,
    154         read_options=pyarrow_csv.ReadOptions(**self.read_options),
    155         parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
    156         convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
    157     )

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:150, in import_optional_dependency(name, extra, errors, min_version)
    148 except ImportError:
    149     if errors == "raise":
--> 150         raise ImportError(msg)
    151     return None
    153 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow.csv'.  Use pip or conda to install pyarrow.csv.

In [54]: df
Out[54]: 
     a    b
0  xxx  yyy
1   ¡¡   ¡¡

By default, these functions and all other IO reader functions return NumPy-backed data. These readers can return PyArrow-backed data by specifying the parameter dtype_backend="pyarrow". A reader does not necessarily need to set engine="pyarrow" in order to return PyArrow-backed data.

In [55]: import io

In [56]: data = io.StringIO("""a,b,c,d,e,f,g,h,i
   ....:     1,2.5,True,a,,,,,
   ....:     3,4.5,False,b,6,7.5,True,a,
   ....: """)
   ....: 

In [57]: df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:147, in import_optional_dependency(name, extra, errors, min_version)
    146 try:
--> 147     module = importlib.import_module(name)
    148 except ImportError:

File /usr/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[57], line 1
----> 1 df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:912, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    899 kwds_defaults = _refine_defaults_read(
    900     dialect,
    901     delimiter,
   (...)
    908     dtype_backend=dtype_backend,
    909 )
    910 kwds.update(kwds_defaults)
--> 912 return _read(filepath_or_buffer, kwds)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:577, in _read(filepath_or_buffer, kwds)
    574 _validate_names(kwds.get("names", None))
    576 # Create the parser.
--> 577 parser = TextFileReader(filepath_or_buffer, **kwds)
    579 if chunksize or iterator:
    580     return parser

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1407, in TextFileReader.__init__(self, f, engine, **kwds)
   1404     self.options["has_index_names"] = kwds["has_index_names"]
   1406 self.handles: IOHandles | None = None
-> 1407 self._engine = self._make_engine(f, self.engine)

File /usr/lib/python3/dist-packages/pandas/io/parsers/readers.py:1679, in TextFileReader._make_engine(self, f, engine)
   1676     raise ValueError(msg)
   1678 try:
-> 1679     return mapping[engine](f, **self.options)
   1680 except Exception:
   1681     if self.handles is not None:

File /usr/lib/python3/dist-packages/pandas/io/parsers/c_parser_wrapper.py:92, in CParserWrapper.__init__(self, src, **kwds)
     89     kwds["dtype_backend"] = "numpy"
     90 if kwds["dtype_backend"] == "pyarrow":
     91     # Fail here loudly instead of in cython after reading
---> 92     import_optional_dependency("pyarrow")
     93 self._reader = parsers.TextReader(src, **kwds)
     95 self.unnamed_cols = self._reader.unnamed_cols

File /usr/lib/python3/dist-packages/pandas/compat/_optional.py:150, in import_optional_dependency(name, extra, errors, min_version)
    148 except ImportError:
    149     if errors == "raise":
--> 150         raise ImportError(msg)
    151     return None
    153 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

In [58]: df_pyarrow.dtypes
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[58], line 1
----> 1 df_pyarrow.dtypes

NameError: name 'df_pyarrow' is not defined

Several non-IO reader functions can also use the dtype_backend argument to return PyArrow-backed data including: