object transformations
Functions available for DataFrame operations.
SQL transformations take a DataFrame as an argument and return a DataFrame. They are suitable arguments for the Dataset#transform method.
It's convenient to work with DataFrames that have snake_case column names. Column names with spaces make it harder to write SQL queries.
- Alphabetic
- By Inheritance
- transformations
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
bulkRegexpReplace(pattern: String = "\u0000", replacement: String = "")(df: DataFrame): DataFrame
Runs regexp_replace on all StringType columns in a DataFrame
Runs regexp_replace on all StringType columns in a DataFrame
val actualDF = sourceDF.transform( transformations.bulkRegexpReplace( "cool", "dude" ) )
Replaces all "cool" strings in all the sourceDF columns of StringType with the string "dude".
- def camelCaseColumns()(df: DataFrame): DataFrame
-
def
camelCaseToSnakeCaseColumns()(df: DataFrame): DataFrame
Convert camel case columns to snake case Example: SomeColumn -> some_column
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native() @HotSpotIntrinsicCandidate()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
extractFromJson(colName: String, outputColName: String, path: String)(df: DataFrame): DataFrame
Extracts an object from a JSON field with a specified path expression
Extracts an object from a JSON field with a specified path expression
val sourceDF = spark.createDF( List( (10, """{"name": "Bart cool", "age": 25}"""), (20, """{"name": "Lisa frost", "age": 27}""") ), List( ("id", IntegerType, true), ("person", StringType, true) ) ) val actualDF = sourceDF.transform( transformations.extractFromJson("person", "name", "$.name") ) actualDF.show() +---+---------------------------------+----------------+ |id |person |name | +---+---------------------------------+----------------+ |10 |{"name": "Bart cool", "age": 25} |"Bart cool" | |20 |{"name": "Lisa frost", "age": 27}|"Lisa frost" | +---+---------------------------------+----------------+
-
def
extractFromJson(colName: String, outputColName: String, jsonSchema: StructType)(df: DataFrame): DataFrame
Extracts an object from a JSON field with a specified schema
Extracts an object from a JSON field with a specified schema
val sourceDF = spark.createDF( List( (10, """{"name": "Bart cool", "age": 25}"""), (20, """{"name": "Lisa frost", "age": 27}""") ), List( ("id", IntegerType, true), ("person", StringType, true) ) ) val personSchema = StructType(List( StructField("name", StringType), StructField("age", IntegerType) )) val actualDF = sourceDF.transform( transformations.extractFromJson("person", "personData", personSchema) ) actualDF.show() +---+---------------------------------+----------------+ |id |person |personData | +---+---------------------------------+----------------+ |10 |{"name": "Bart cool", "age": 25} |[Bart cool, 25] | |20 |{"name": "Lisa frost", "age": 27}|[Lisa frost, 27]| +---+---------------------------------+----------------+
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
def
modifyColumnNames(stringFun: (String) ⇒ String)(df: DataFrame): DataFrame
Changes all the column names in a DataFrame
-
def
multiRegexpReplace(cols: List[Column], pattern: String = "\u0000", replacement: String = "")(df: DataFrame): DataFrame
Runs regexp_replace on multiple columns
Runs regexp_replace on multiple columns
val actualDF = sourceDF.transform( transformations.multiRegexpReplace( List(col("person"), col("phone")), "cool", "dude" ) )
Replaces all "cool" strings in the person and phone columns with the string "dude".
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
- def prependToColName(str: String)(df: DataFrame): DataFrame
-
def
snakeCaseColumns()(df: DataFrame): DataFrame
snake_cases all the columns of a DataFrame.
spark-daria defines a com.github.mrpowers.spark.daria.sql.transformations.snakeCaseColumns transformation to convert all the column names to snake_case.
import com.github.mrpowers.spark.daria.sql.transformations._
val sourceDf = Seq( ("funny", "joke") ).toDF("A b C", "de F") val actualDf = sourceDf.transform(snakeCaseColumns) actualDf.show() +-----+----+ |a_b_c|de_f| +-----+----+ |funny|joke| +-----+----+
-
def
snakifyColumns()(df: DataFrame): DataFrame
snakifies all the columns of a DataFrame
snakifies all the columns of a DataFrame
import com.github.mrpowers.spark.daria.sql.transformations._
val sourceDf = Seq( ("funny", "joke") ).toDF("ThIs", "BiH") val actualDf = sourceDf.transform(snakifyColumns()) actualDf.show() +-----+----+ |th_is|bi_h| +-----+----+ |funny|joke| +-----+----+
-
def
sortColumns(order: String = "asc")(df: DataFrame): DataFrame
Sorts the columns of a DataFrame alphabetically.
The sortColumns transformation sorts the columns in a DataFrame alphabetically.
Suppose you start with the following sourceDF:
+-----+---+-----+
| name|age|sport|
+-----+---+-----+
|pablo|  3| polo|
+-----+---+-----+
Run the code:
val actualDF = sourceDF.transform(sortColumns())
Here's the actualDF:
+---+-----+-----+
|age| name|sport|
+---+-----+-----+
|  3|pablo| polo|
+---+-----+-----+
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
titleCaseColumns()(df: DataFrame): DataFrame
Title Cases all the columns of a DataFrame
- def toSnakeCaseColumns()(df: DataFrame): DataFrame
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
def
truncateColumns(columnLengths: Map[String, Int])(df: DataFrame): DataFrame
Truncates multiple columns in a DataFrame
Truncates multiple columns in a DataFrame
val columnLengths: Map[String, Int] = Map( "person" -> 2, "phone" -> 3 ) sourceDF.transform( truncateColumns(columnLengths) )
Limits the "person" column to 2 characters and the "phone" column to 3 characters.
- def upperCaseColumns()(df: DataFrame): DataFrame
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
def
withColBucket(colName: String, outputColName: String, buckets: Array[(Any, Any)], inclusiveBoundries: Boolean = false, lowestBoundLte: Boolean = false, highestBoundGte: Boolean = false)(df: DataFrame): DataFrame
Categorizes a numeric column in various user specified "buckets"
-
def
withParquetCompatibleColumnNames()(df: DataFrame): DataFrame
Strips out invalid characters and replaces spaces with underscores to make Parquet compatible column names
- def withRowAsStruct(outputColName: String = "row_as_struct")(df: DataFrame): DataFrame
Deprecated Value Members
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] ) @Deprecated
- Deprecated