test_expressions.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. import operator
  2. import re
  3. import numpy as np
  4. import pytest
  5. import pandas._testing as tm
  6. from pandas.core.api import (
  7. DataFrame,
  8. Index,
  9. Series,
  10. )
  11. from pandas.core.computation import expressions as expr
  12. _frame = DataFrame(np.random.randn(10001, 4), columns=list("ABCD"), dtype="float64")
  13. _frame2 = DataFrame(np.random.randn(100, 4), columns=list("ABCD"), dtype="float64")
  14. _mixed = DataFrame(
  15. {
  16. "A": _frame["A"].copy(),
  17. "B": _frame["B"].astype("float32"),
  18. "C": _frame["C"].astype("int64"),
  19. "D": _frame["D"].astype("int32"),
  20. }
  21. )
  22. _mixed2 = DataFrame(
  23. {
  24. "A": _frame2["A"].copy(),
  25. "B": _frame2["B"].astype("float32"),
  26. "C": _frame2["C"].astype("int64"),
  27. "D": _frame2["D"].astype("int32"),
  28. }
  29. )
  30. _integer = DataFrame(
  31. np.random.randint(1, 100, size=(10001, 4)), columns=list("ABCD"), dtype="int64"
  32. )
  33. _integer2 = DataFrame(
  34. np.random.randint(1, 100, size=(101, 4)), columns=list("ABCD"), dtype="int64"
  35. )
  36. _array = _frame["A"].values.copy()
  37. _array2 = _frame2["A"].values.copy()
  38. _array_mixed = _mixed["D"].values.copy()
  39. _array_mixed2 = _mixed2["D"].values.copy()
  40. @pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr")
  41. class TestExpressions:
  42. def setup_method(self, method):
  43. self.frame = _frame.copy()
  44. self.frame2 = _frame2.copy()
  45. self.mixed = _mixed.copy()
  46. self.mixed2 = _mixed2.copy()
  47. self._MIN_ELEMENTS = expr._MIN_ELEMENTS
  48. def teardown_method(self, method):
  49. expr._MIN_ELEMENTS = self._MIN_ELEMENTS
  50. @staticmethod
  51. def call_op(df, other, flex: bool, opname: str):
  52. if flex:
  53. op = lambda x, y: getattr(x, opname)(y)
  54. op.__name__ = opname
  55. else:
  56. op = getattr(operator, opname)
  57. expr.set_use_numexpr(False)
  58. expected = op(df, other)
  59. expr.set_use_numexpr(True)
  60. expr.get_test_result()
  61. result = op(df, other)
  62. return result, expected
  63. def run_arithmetic(self, df, other, flex: bool):
  64. expr._MIN_ELEMENTS = 0
  65. operations = ["add", "sub", "mul", "mod", "truediv", "floordiv"]
  66. for arith in operations:
  67. result, expected = self.call_op(df, other, flex, arith)
  68. if arith == "truediv":
  69. if expected.ndim == 1:
  70. assert expected.dtype.kind == "f"
  71. else:
  72. assert all(x.kind == "f" for x in expected.dtypes.values)
  73. tm.assert_equal(expected, result)
  74. def run_binary(self, df, other, flex: bool):
  75. """
  76. tests solely that the result is the same whether or not numexpr is
  77. enabled. Need to test whether the function does the correct thing
  78. elsewhere.
  79. """
  80. expr._MIN_ELEMENTS = 0
  81. expr.set_test_mode(True)
  82. operations = ["gt", "lt", "ge", "le", "eq", "ne"]
  83. for arith in operations:
  84. result, expected = self.call_op(df, other, flex, arith)
  85. used_numexpr = expr.get_test_result()
  86. assert used_numexpr, "Did not use numexpr as expected."
  87. tm.assert_equal(expected, result)
  88. def run_frame(self, df, other, flex: bool):
  89. self.run_arithmetic(df, other, flex)
  90. expr.set_use_numexpr(False)
  91. binary_comp = other + 1
  92. expr.set_use_numexpr(True)
  93. self.run_binary(df, binary_comp, flex)
  94. for i in range(len(df.columns)):
  95. self.run_arithmetic(df.iloc[:, i], other.iloc[:, i], flex)
  96. # FIXME: dont leave commented-out
  97. # series doesn't uses vec_compare instead of numexpr...
  98. # binary_comp = other.iloc[:, i] + 1
  99. # self.run_binary(df.iloc[:, i], binary_comp, flex)
  100. @pytest.mark.parametrize(
  101. "df",
  102. [
  103. _integer,
  104. _integer2,
  105. # randint to get a case with zeros
  106. _integer * np.random.randint(0, 2, size=np.shape(_integer)),
  107. _frame,
  108. _frame2,
  109. _mixed,
  110. _mixed2,
  111. ],
  112. )
  113. @pytest.mark.parametrize("flex", [True, False])
  114. def test_arithmetic(self, df, flex):
  115. self.run_frame(df, df, flex)
  116. def test_invalid(self):
  117. array = np.random.randn(1_000_001)
  118. array2 = np.random.randn(100)
  119. # no op
  120. result = expr._can_use_numexpr(operator.add, None, array, array, "evaluate")
  121. assert not result
  122. # min elements
  123. result = expr._can_use_numexpr(operator.add, "+", array2, array2, "evaluate")
  124. assert not result
  125. # ok, we only check on first part of expression
  126. result = expr._can_use_numexpr(operator.add, "+", array, array2, "evaluate")
  127. assert result
  128. @pytest.mark.parametrize(
  129. "opname,op_str",
  130. [("add", "+"), ("sub", "-"), ("mul", "*"), ("truediv", "/"), ("pow", "**")],
  131. )
  132. @pytest.mark.parametrize(
  133. "left,right", [(_array, _array2), (_array_mixed, _array_mixed2)]
  134. )
  135. def test_binary_ops(self, opname, op_str, left, right):
  136. def testit():
  137. if opname == "pow":
  138. # TODO: get this working
  139. return
  140. op = getattr(operator, opname)
  141. result = expr.evaluate(op, left, left, use_numexpr=True)
  142. expected = expr.evaluate(op, left, left, use_numexpr=False)
  143. tm.assert_numpy_array_equal(result, expected)
  144. result = expr._can_use_numexpr(op, op_str, right, right, "evaluate")
  145. assert not result
  146. expr.set_use_numexpr(False)
  147. testit()
  148. expr.set_use_numexpr(True)
  149. expr.set_numexpr_threads(1)
  150. testit()
  151. expr.set_numexpr_threads()
  152. testit()
  153. @pytest.mark.parametrize(
  154. "opname,op_str",
  155. [
  156. ("gt", ">"),
  157. ("lt", "<"),
  158. ("ge", ">="),
  159. ("le", "<="),
  160. ("eq", "=="),
  161. ("ne", "!="),
  162. ],
  163. )
  164. @pytest.mark.parametrize(
  165. "left,right", [(_array, _array2), (_array_mixed, _array_mixed2)]
  166. )
  167. def test_comparison_ops(self, opname, op_str, left, right):
  168. def testit():
  169. f12 = left + 1
  170. f22 = right + 1
  171. op = getattr(operator, opname)
  172. result = expr.evaluate(op, left, f12, use_numexpr=True)
  173. expected = expr.evaluate(op, left, f12, use_numexpr=False)
  174. tm.assert_numpy_array_equal(result, expected)
  175. result = expr._can_use_numexpr(op, op_str, right, f22, "evaluate")
  176. assert not result
  177. expr.set_use_numexpr(False)
  178. testit()
  179. expr.set_use_numexpr(True)
  180. expr.set_numexpr_threads(1)
  181. testit()
  182. expr.set_numexpr_threads()
  183. testit()
  184. @pytest.mark.parametrize("cond", [True, False])
  185. @pytest.mark.parametrize("df", [_frame, _frame2, _mixed, _mixed2])
  186. def test_where(self, cond, df):
  187. def testit():
  188. c = np.empty(df.shape, dtype=np.bool_)
  189. c.fill(cond)
  190. result = expr.where(c, df.values, df.values + 1)
  191. expected = np.where(c, df.values, df.values + 1)
  192. tm.assert_numpy_array_equal(result, expected)
  193. expr.set_use_numexpr(False)
  194. testit()
  195. expr.set_use_numexpr(True)
  196. expr.set_numexpr_threads(1)
  197. testit()
  198. expr.set_numexpr_threads()
  199. testit()
  200. @pytest.mark.parametrize(
  201. "op_str,opname", [("/", "truediv"), ("//", "floordiv"), ("**", "pow")]
  202. )
  203. def test_bool_ops_raise_on_arithmetic(self, op_str, opname):
  204. df = DataFrame({"a": np.random.rand(10) > 0.5, "b": np.random.rand(10) > 0.5})
  205. msg = f"operator '{opname}' not implemented for bool dtypes"
  206. f = getattr(operator, opname)
  207. err_msg = re.escape(msg)
  208. with pytest.raises(NotImplementedError, match=err_msg):
  209. f(df, df)
  210. with pytest.raises(NotImplementedError, match=err_msg):
  211. f(df.a, df.b)
  212. with pytest.raises(NotImplementedError, match=err_msg):
  213. f(df.a, True)
  214. with pytest.raises(NotImplementedError, match=err_msg):
  215. f(False, df.a)
  216. with pytest.raises(NotImplementedError, match=err_msg):
  217. f(False, df)
  218. with pytest.raises(NotImplementedError, match=err_msg):
  219. f(df, True)
  220. @pytest.mark.parametrize(
  221. "op_str,opname", [("+", "add"), ("*", "mul"), ("-", "sub")]
  222. )
  223. def test_bool_ops_warn_on_arithmetic(self, op_str, opname):
  224. n = 10
  225. df = DataFrame({"a": np.random.rand(n) > 0.5, "b": np.random.rand(n) > 0.5})
  226. subs = {"+": "|", "*": "&", "-": "^"}
  227. sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}
  228. f = getattr(operator, opname)
  229. fe = getattr(operator, sub_funcs[subs[op_str]])
  230. if op_str == "-":
  231. # raises TypeError
  232. return
  233. with tm.use_numexpr(True, min_elements=5):
  234. with tm.assert_produces_warning(check_stacklevel=False):
  235. r = f(df, df)
  236. e = fe(df, df)
  237. tm.assert_frame_equal(r, e)
  238. with tm.assert_produces_warning(check_stacklevel=False):
  239. r = f(df.a, df.b)
  240. e = fe(df.a, df.b)
  241. tm.assert_series_equal(r, e)
  242. with tm.assert_produces_warning(check_stacklevel=False):
  243. r = f(df.a, True)
  244. e = fe(df.a, True)
  245. tm.assert_series_equal(r, e)
  246. with tm.assert_produces_warning(check_stacklevel=False):
  247. r = f(False, df.a)
  248. e = fe(False, df.a)
  249. tm.assert_series_equal(r, e)
  250. with tm.assert_produces_warning(check_stacklevel=False):
  251. r = f(False, df)
  252. e = fe(False, df)
  253. tm.assert_frame_equal(r, e)
  254. with tm.assert_produces_warning(check_stacklevel=False):
  255. r = f(df, True)
  256. e = fe(df, True)
  257. tm.assert_frame_equal(r, e)
  258. @pytest.mark.parametrize(
  259. "test_input,expected",
  260. [
  261. (
  262. DataFrame(
  263. [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"]
  264. ),
  265. DataFrame([[False, False], [False, False]], columns=["a", "dtype"]),
  266. ),
  267. (
  268. DataFrame(
  269. [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]],
  270. columns=["a", "b", "c", "dtype"],
  271. ),
  272. DataFrame(
  273. [[False, False], [False, False], [False, False]],
  274. columns=["a", "dtype"],
  275. ),
  276. ),
  277. ],
  278. )
  279. def test_bool_ops_column_name_dtype(self, test_input, expected):
  280. # GH 22383 - .ne fails if columns containing column name 'dtype'
  281. result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]])
  282. tm.assert_frame_equal(result, expected)
  283. @pytest.mark.parametrize(
  284. "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv")
  285. )
  286. @pytest.mark.parametrize("axis", (0, 1))
  287. def test_frame_series_axis(self, axis, arith):
  288. # GH#26736 Dataframe.floordiv(Series, axis=1) fails
  289. df = self.frame
  290. if axis == 1:
  291. other = self.frame.iloc[0, :]
  292. else:
  293. other = self.frame.iloc[:, 0]
  294. expr._MIN_ELEMENTS = 0
  295. op_func = getattr(df, arith)
  296. expr.set_use_numexpr(False)
  297. expected = op_func(other, axis=axis)
  298. expr.set_use_numexpr(True)
  299. result = op_func(other, axis=axis)
  300. tm.assert_frame_equal(expected, result)
  301. @pytest.mark.parametrize(
  302. "op",
  303. [
  304. "__mod__",
  305. pytest.param("__rmod__", marks=pytest.mark.xfail(reason="GH-36552")),
  306. "__floordiv__",
  307. "__rfloordiv__",
  308. ],
  309. )
  310. @pytest.mark.parametrize("box", [DataFrame, Series, Index])
  311. @pytest.mark.parametrize("scalar", [-5, 5])
  312. def test_python_semantics_with_numexpr_installed(self, op, box, scalar):
  313. # https://github.com/pandas-dev/pandas/issues/36047
  314. expr._MIN_ELEMENTS = 0
  315. data = np.arange(-50, 50)
  316. obj = box(data)
  317. method = getattr(obj, op)
  318. result = method(scalar)
  319. # compare result with numpy
  320. expr.set_use_numexpr(False)
  321. expected = method(scalar)
  322. expr.set_use_numexpr(True)
  323. tm.assert_equal(result, expected)
  324. # compare result element-wise with Python
  325. for i, elem in enumerate(data):
  326. if box == DataFrame:
  327. scalar_result = result.iloc[i, 0]
  328. else:
  329. scalar_result = result[i]
  330. try:
  331. expected = getattr(int(elem), op)(scalar)
  332. except ZeroDivisionError:
  333. pass
  334. else:
  335. assert scalar_result == expected