#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# mypy: disable-error-code="empty-body"

import sys
from typing import (
    overload,
    Any,
    TYPE_CHECKING,
    Union,
)

from pyspark.sql.tvf_argument import TableValuedFunctionArgument
from pyspark.sql.utils import dispatch_col_method
from pyspark.sql.types import DataType
from pyspark.errors import PySparkValueError

if TYPE_CHECKING:
    from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral
    from pyspark.sql.window import WindowSpec

__all__ = ["Column"]
class Column(TableValuedFunctionArgument):
    """
    A column in a DataFrame.

    .. versionadded:: 1.3.0

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    Examples
    --------
    Column instances can be created by

    >>> df = spark.createDataFrame(
    ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])

    Select a column out of a DataFrame

    >>> df.name
    Column<'name'>
    >>> df["name"]
    Column<'name'>

    Create from an expression

    >>> df.age + 1
    Column<...>
    >>> 1 / df.age
    Column<...>
    """

    # HACK ALERT!! this is to reduce the backward compatibility concern, and returns
    # Spark Classic Column by default. This is NOT an API, and NOT supposed to
    # be directly invoked. DO NOT use this constructor.
    def __new__(cls, *args: Any, **kwargs: Any) -> "Column":
        from pyspark.sql.classic.column import Column

        return Column.__new__(Column, *args, **kwargs)

    # arithmetic operators
    @dispatch_col_method
    def __neg__(self) -> "Column":
        ...

    @dispatch_col_method
    def __add__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __sub__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __mul__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __div__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __truediv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __mod__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __radd__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rsub__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rmul__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rdiv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rtruediv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rmod__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __pow__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rpow__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    # comparison operators
    @dispatch_col_method
    def __eq__(  # type: ignore[override]
        self,
        other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
    ) -> "Column":
        """binary function"""
        ...

    @dispatch_col_method
    def __ne__(  # type: ignore[override]
        self,
        other: Any,
    ) -> "Column":
        """binary function"""
        ...

    @dispatch_col_method
    def __lt__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __le__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __ge__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __gt__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...
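    # A quick illustration (a sketch, assuming an active `spark` session): the dunder
    # methods above build lazy Column expressions rather than computing values eagerly,
    # so even `==` returns a Column, not a bool, and a Column cannot be used in `if`.
    #
    #     df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #     expr = (df.age + 1) * 2 == 6   # a Column expression, evaluated lazily
    #     df.filter(expr).show()         # computed only when an action such as show() runs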
    # `and`, `or`, `not` cannot be overloaded in Python,
    # so use bitwise operators as boolean operators
    @dispatch_col_method
    def __and__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __or__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __invert__(self) -> "Column":
        ...

    @dispatch_col_method
    def __rand__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __ror__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    # container operators
    @dispatch_col_method
    def __contains__(self, item: Any) -> None:
        raise PySparkValueError(
            errorClass="CANNOT_APPLY_IN_FOR_COLUMN",
            messageParameters={},
        )

    # bitwise operators
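    # A quick illustration of the distinction here (a sketch, assuming an active `spark`
    # session): `&`, `|` and `~` act as boolean operators on Column expressions, while
    # the bitwiseOR/bitwiseAND/bitwiseXOR methods below perform integer bitwise
    # arithmetic. Because `&` and `|` bind more tightly than comparisons in Python,
    # each comparison needs its own parentheses:
    #
    #     df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    #     df.filter((df.age > 3) & (df.name != "Alice")).show()  # boolean AND of two conditions
    #     df.filter(~(df.age > 3)).show()                        # boolean NOT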
    @dispatch_col_method
    def bitwiseOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise OR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise or(|) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseOR(df.b)).collect()
        [Row((a | b)=235)]
        """
        ...
    @dispatch_col_method
    def bitwiseAND(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise AND of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise and(&) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseAND(df.b)).collect()
        [Row((a & b)=10)]
        """
        ...
    @dispatch_col_method
    def bitwiseXOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise XOR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise xor(^) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseXOR(df.b)).collect()
        [Row((a ^ b)=225)]
        """
        ...
    @dispatch_col_method
    def getItem(self, key: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        key
            a literal value, or a :class:`Column` expression.
            The result will only be true at a location if the item matches in the column.

            .. deprecated:: 3.0.0
                :class:`Column` as a parameter is deprecated.

        Returns
        -------
        :class:`Column`
            Column representing the item(s) got at position out of a list or by key out of a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
        +----+------+
        |l[0]|d[key]|
        +----+------+
        |   1| value|
        +----+------+
        """
        ...
    @dispatch_col_method
    def getField(self, name: Any) -> "Column":
        """
        An expression that gets a field by name in a :class:`StructType`.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        name
            a literal value, or a :class:`Column` expression naming the field.

            .. deprecated:: 3.0.0
                :class:`Column` as a parameter is deprecated.

        Returns
        -------
        :class:`Column`
            Column representing the field of the struct selected by name.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
        >>> df.select(df.r.getField("b")).show()
        +---+
        |r.b|
        +---+
        |  b|
        +---+
        >>> df.select(df.r.a).show()
        +---+
        |r.a|
        +---+
        |  1|
        +---+
        """
        ...
    @dispatch_col_method
    def withField(self, fieldName: str, col: "Column") -> "Column":
        """
        An expression that adds/replaces a field in :class:`StructType` by name.

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldName : str
            a literal string naming the field to add or replace.
        col : :class:`Column`
            A :class:`Column` expression for the column with `fieldName`.

        Returns
        -------
        :class:`Column`
            Column of struct values with the field added or replaced by `fieldName`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import lit
        >>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
        >>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show()
        +---+
        |  b|
        +---+
        |  3|
        +---+
        >>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show()
        +---+
        |  d|
        +---+
        |  4|
        +---+
        """
        ...
    @dispatch_col_method
    def dropFields(self, *fieldNames: str) -> "Column":
        """
        An expression that drops fields in :class:`StructType` by name.
        This is a no-op if the schema doesn't contain field name(s).

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldNames : str
            Desired field names (collects all positional arguments passed)
            The result will drop at a location if any field matches in the Column.

        Returns
        -------
        :class:`Column`
            Column of struct values with the given field(s) dropped.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import col, lit
        >>> df = spark.createDataFrame([
        ...     Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
        >>> df.withColumn('a', df['a'].dropFields('b')).show()
        +-----------------+
        |                a|
        +-----------------+
        |{2, 3, {4, 5, 6}}|
        +-----------------+

        >>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
        +--------------+
        |             a|
        +--------------+
        |{3, {4, 5, 6}}|
        +--------------+

        This method supports dropping multiple nested fields directly e.g.

        >>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+

        However, if you are going to add/replace multiple nested fields,
        it is preferred to extract out the nested struct before
        adding/replacing multiple fields e.g.

        >>> df.select(col("a").withField(
        ...     "e", col("a.e").dropFields("g", "h")).alias("a")
        ... ).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+
        """
        ...
    @dispatch_col_method
    def __getattr__(self, item: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        item
            a literal value.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.d.key).show()
        +------+
        |d[key]|
        +------+
        | value|
        +------+
        """
        ...
    @dispatch_col_method
    def __getitem__(self, k: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        k
            a literal value, or a slice object without step.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict, or substrings sliced by
            the given slice object.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.l[slice(1, 3)], df.d['key']).show()
        +---------------+------+
        |substr(l, 1, 3)|d[key]|
        +---------------+------+
        |            abc| value|
        +---------------+------+
        """
        ...
    @dispatch_col_method
    def contains(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Contains the other element. Returns a boolean :class:`Column` based on a string
        match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            string in line. A value as a literal or a :class:`Column`.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.contains('o')).collect()
        [Row(age=5, name='Bob')]
        """
        ...
    @dispatch_col_method
    def startswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String starts with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at start of line (do not use a regex `^`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.startswith('Al')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.startswith('^Al')).collect()
        []
        """
        ...
    @dispatch_col_method
    def endswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String ends with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at end of line (do not use a regex `$`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.endswith('ice')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.endswith('ice$')).collect()
        []
        """
        ...
    @dispatch_col_method
    def like(self: "Column", other: str) -> "Column":
        """
        SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.like('Al%')).collect()
        [Row(age=2, name='Alice')]
        """
        ...
    @dispatch_col_method
    def rlike(self: "Column", other: str) -> "Column":
        """
        SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a
        regex match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            an extended regex expression

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by extended regex expression.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.rlike('ice$')).collect()
        [Row(age=2, name='Alice')]
        """
        ...
    @dispatch_col_method
    def ilike(self: "Column", other: str) -> "Column":
        """
        SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column`
        based on a case insensitive match.

        .. versionadded:: 3.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.ilike('%Ice')).collect()
        [Row(age=2, name='Alice')]
        """
        ...
    @dispatch_col_method
    def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column":
        """
        Return a :class:`Column` which is a substring of the column.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        startPos : :class:`Column` or int
            start position
        length : :class:`Column` or int
            length of the substring

        Returns
        -------
        :class:`Column`
            Column representing the substring of each value in the original Column.

        Examples
        --------
        Example 1. Using integers for the input arguments.

        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name.substr(1, 3).alias("col")).collect()
        [Row(col='Ali'), Row(col='Bob')]

        Example 2. Using columns for the input arguments.

        >>> df = spark.createDataFrame(
        ...      [(3, 4, "Alice"), (2, 3, "Bob")], ["sidx", "eidx", "name"])
        >>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect()
        [Row(col='ice'), Row(col='ob')]
        """
        ...
    @dispatch_col_method
    def isin(self, *cols: Any) -> "Column":
        """
        A boolean expression that is evaluated to true if the value of this
        expression is contained by the evaluated values of the arguments.

        .. versionadded:: 1.5.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        cols : Any
            The values to compare with the column values. The result will only be true at a
            location if any value matches in the Column.

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element in the Column is contained in cols.

        Examples
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob"), (8, "Mike")], ["age", "name"])

        Example 1: Filter rows with names in the specified values

        >>> df[df.name.isin("Bob", "Mike")].show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        |  8|Mike|
        +---+----+

        Example 2: Filter rows with ages in the specified list

        >>> df[df.age.isin([1, 2, 3])].show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+

        Example 3: Filter rows with names not in the specified values

        >>> df[~df.name.isin("Alice", "Bob")].show()
        +---+----+
        |age|name|
        +---+----+
        |  8|Mike|
        +---+----+
        """
        ...
    # order
    @dispatch_col_method
    def asc(self) -> "Column":
        """
        Returns a sort expression based on the ascending order of the column.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc()).collect()
        [Row(name='Alice'), Row(name='Tom')]
        """
        ...
    @dispatch_col_method
    def asc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        return before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...      [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
        [Row(name=None), Row(name='Alice'), Row(name='Tom')]
        """
        ...
    @dispatch_col_method
    def asc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...      [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
        [Row(name='Alice'), Row(name='Tom'), Row(name=None)]
        """
        ...
    @dispatch_col_method
    def desc(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc()).collect()
        [Row(name='Tom'), Row(name='Alice')]
        """
        ...
    @dispatch_col_method
    def desc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...      [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
        [Row(name=None), Row(name='Tom'), Row(name='Alice')]
        """
        ...
    @dispatch_col_method
    def desc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...      [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
        [Row(name='Tom'), Row(name='Alice'), Row(name=None)]
        """
        ...
    @dispatch_col_method
    def isNull(self) -> "Column":
        """
        True if the current expression is null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNull()).collect()
        [Row(name='Alice', height=None)]
        """
        ...
    @dispatch_col_method
    def isNotNull(self) -> "Column":
        """
        True if the current expression is NOT null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNotNull()).collect()
        [Row(name='Tom', height=80)]
        """
        ...
    @dispatch_col_method
    def isNaN(self) -> "Column":
        """
        True if the current expression is NaN.

        .. versionadded:: 4.0.0

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))])
        >>> df.filter(df.height.isNaN()).collect()
        [Row(name='Alice', height=nan)]
        """
        ...
    @dispatch_col_method
    def alias(self, *alias: str, **kwargs: Any) -> "Column":
        """
        Returns this column aliased with a new name or names (in the case of expressions that
        return more than one column, such as explode).

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        alias : str
            desired column names (collects all positional arguments passed)

        Other Parameters
        ----------------
        metadata: dict
            a dict of information to be stored in ``metadata`` attribute of the
            corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword
            only argument)

            .. versionchanged:: 2.2.0
               Added optional ``metadata`` argument.

        Returns
        -------
        :class:`Column`
            Column aliased with the new name or names.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.alias("age2")).collect()
        [Row(age2=2), Row(age2=5)]
        >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max']
        99
        """
        ...
    @dispatch_col_method
    def name(self, *alias: str, **kwargs: Any) -> "Column":
        """
        :func:`name` is an alias for :func:`alias`.

        .. versionadded:: 2.0.0
        """
        ...
    @dispatch_col_method
    def cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        Casts the column into type ``dataType``.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a DataType or Python string literal with a DDL-formatted string
            to use when parsing the column to the same type.

        Returns
        -------
        :class:`Column`
            Column with values cast into the new type.

        Examples
        --------
        >>> from pyspark.sql.types import StringType
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        """
        ...
    @dispatch_col_method
    def try_cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        This is a special version of `cast` that performs the same operation, but returns
        NULL instead of raising an error if the cast cannot be performed.

        .. versionadded:: 4.0.0

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a DataType or Python string literal with a DDL-formatted string
            to use when parsing the column to the same type.

        Returns
        -------
        :class:`Column`
            Column with values cast into the new type, or NULL where the cast fails.

        Examples
        --------
        Example 1: Cast with a Datatype

        >>> from pyspark.sql.types import LongType
        >>> df = spark.createDataFrame(
        ...      [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast(LongType())).show()
        +----+
        |name|
        +----+
        | 123|
        |NULL|
        |NULL|
        +----+

        Example 2: Cast with a DDL string

        >>> df = spark.createDataFrame(
        ...      [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast("double")).show()
        +-----+
        | name|
        +-----+
        |123.0|
        | NULL|
        | NULL|
        +-----+
        """
        ...
    @dispatch_col_method
    def astype(self, dataType: Union[DataType, str]) -> "Column":
        """
        :func:`astype` is an alias for :func:`cast`.

        .. versionadded:: 1.4.0
        """
        ...
    @dispatch_col_method
    def between(
        self,
        lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
        upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
    ) -> "Column":
        """
        Check if the current column's values are between the specified lower and upper
        bounds, inclusive.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The lower boundary value, inclusive.
        upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The upper boundary value, inclusive.

        Returns
        -------
        :class:`Column`
            A new column of boolean values indicating whether each element in the original
            column is within the specified range (inclusive).

        Examples
        --------
        Using between with integer values.

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, df.age.between(2, 4)).show()
        +-----+---------------------------+
        | name|((age >= 2) AND (age <= 4))|
        +-----+---------------------------+
        |Alice|                       true|
        |  Bob|                      false|
        +-----+---------------------------+

        Using between with string values.

        >>> df = spark.createDataFrame([("Alice", "A"), ("Bob", "B")], ["name", "initial"])
        >>> df.select(df.name, df.initial.between("A", "B")).show()
        +-----+-----------------------------------+
        | name|((initial >= A) AND (initial <= B))|
        +-----+-----------------------------------+
        |Alice|                               true|
        |  Bob|                               true|
        +-----+-----------------------------------+

        Using between with float values.

        >>> df = spark.createDataFrame(
        ...     [(2.5, "Alice"), (5.5, "Bob")], ["height", "name"])
        >>> df.select(df.name, df.height.between(2.0, 5.0)).show()
        +-----+-------------------------------------+
        | name|((height >= 2.0) AND (height <= 5.0))|
        +-----+-------------------------------------+
        |Alice|                                 true|
        |  Bob|                                false|
        +-----+-------------------------------------+

        Using between with date values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01"), ("Bob", "2023-02-01")], ["name", "date"])
        >>> df = df.withColumn("date", sf.to_date(df.date))
        >>> df.select(df.name, df.date.between("2023-01-01", "2023-01-15")).show()
        +-----+-----------------------------------------------+
        | name|((date >= 2023-01-01) AND (date <= 2023-01-15))|
        +-----+-----------------------------------------------+
        |Alice|                                           true|
        |  Bob|                                          false|
        +-----+-----------------------------------------------+

        >>> from datetime import date
        >>> df.select(df.name, df.date.between(date(2023, 1, 1), date(2023, 1, 15))).show()
        +-----+-------------------------------------------------------------+
        | name|((date >= DATE '2023-01-01') AND (date <= DATE '2023-01-15'))|
        +-----+-------------------------------------------------------------+
        |Alice|                                                          true|
        |  Bob|                                                         false|
        +-----+-------------------------------------------------------------+

        Using between with timestamp values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01 10:00:00"), ("Bob", "2023-02-01 10:00:00")],
        ...     schema=["name", "timestamp"])
        >>> df = df.withColumn("timestamp", sf.to_timestamp(df.timestamp))
        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01")).show()
        +-----+---------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01))|
        +-----+---------------------------------------------------------+
        |Alice|                                                      true|
        |  Bob|                                                     false|
        +-----+---------------------------------------------------------+

        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01 12:00:00")).show()
        +-----+------------------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01 12:00:00))|
        +-----+------------------------------------------------------------------+
        |Alice|                                                               true|
        |  Bob|                                                               true|
        +-----+------------------------------------------------------------------+
        """
        ...
    @dispatch_col_method
    def when(self, condition: "Column", value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        condition : :class:`Column`
            a boolean :class:`Column` expression.
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column of results, where each row takes the value of the first condition it matches.

        Examples
        --------
        Example 1: Using :func:`when` with conditions and values to create a new Column

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0))
        >>> result.show()
        +-----+------------------------------------------------------------+
        | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
        +-----+------------------------------------------------------------+
        |Alice|                                                          -1|
        |  Bob|                                                           1|
        +-----+------------------------------------------------------------+

        Example 2: Chaining multiple :func:`when` conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(1, "Alice"), (4, "Bob"), (6, "Charlie")], ["age", "name"])
        >>> result = df.select(
        ...     df.name,
        ...     sf.when(df.age < 3, "Young").when(df.age < 5, "Middle-aged").otherwise("Old")
        ... )
        >>> result.show()
        +-------+---------------------------------------------------------------------------+
        |   name|CASE WHEN (age < 3) THEN Young WHEN (age < 5) THEN Middle-aged ELSE Old END|
        +-------+---------------------------------------------------------------------------+
        |  Alice|                                                                       Young|
        |    Bob|                                                                 Middle-aged|
        |Charlie|                                                                         Old|
        +-------+---------------------------------------------------------------------------+

        Example 3: Using literal values as conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(
        ...     df.name, sf.when(sf.lit(True), 1).otherwise(
        ...         sf.raise_error("unreachable")).alias("when"))
        >>> result.show()
        +-----+----+
        | name|when|
        +-----+----+
        |Alice|   1|
        |  Bob|   1|
        +-----+----+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...
    @dispatch_col_method
    def otherwise(self, value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column of results, where unmatched rows take the given default value.

        Examples
        --------
        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show()
        +-----+-------------------------------------+
        | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
        +-----+-------------------------------------+
        |Alice|                                    0|
        |  Bob|                                    1|
        +-----+-------------------------------------+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...
    @dispatch_col_method
    def outer(self) -> "Column":
        """
        Mark this column as an outer column if its expression refers to columns from an outer
        query. This is used to trigger lazy analysis of Spark Classic DataFrame, so that we can
        use it to build subquery expressions. Spark Connect DataFrame is always lazily analyzed
        and does not need to use this function.

        .. versionadded:: 4.0.0

        See Also
        --------
        pyspark.sql.dataframe.DataFrame.scalar
        pyspark.sql.dataframe.DataFrame.exists
        """
        ...
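    # Rough usage sketch for outer() (an assumption about the Spark 4.0 subquery API
    # referenced in See Also above; exact usage may differ): outer() marks the reference
    # to the enclosing DataFrame's column when building a correlated scalar subquery.
    #
    #     from pyspark.sql import functions as sf
    #     customers = spark.createDataFrame([(1, "Alice"), (2, "Bob")], ["id", "name"])
    #     orders = spark.createDataFrame([(1, 10), (1, 20), (2, 5)], ["cust_id", "amount"])
    #     customers.select(
    #         "name",
    #         orders.where(orders.cust_id == customers.id.outer())  # outer() marks the correlation
    #         .select(sf.sum("amount"))
    #         .scalar()
    #         .alias("total"),
    #     ).show()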