Package com.github.mrpowers.spark.daria.sql

object transformations

Functions available for DataFrame operations.

SQL transformations take a DataFrame as an argument and return a DataFrame. They are suitable arguments for the Dataset#transform method.

It's convenient to work with DataFrames that have snake_case column names. Column names with spaces make it harder to write SQL queries.

Linear Supertypes
AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. transformations
  2. AnyRef
  3. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  5. def bulkRegexpReplace(pattern: String = "\u0000", replacement: String = "")(df: DataFrame): DataFrame

    Runs regexp_replace on all StringType columns in a DataFrame

    Runs regexp_replace on all StringType columns in a DataFrame

    val actualDF = sourceDF.transform(
      transformations.bulkRegexpReplace(
        "cool",
        "dude"
      )
    )

    Replaces all "cool" strings in all the sourceDF columns of StringType with the string "dude".

  6. def camelCaseColumns()(df: DataFrame): DataFrame
  7. def camelCaseToSnakeCaseColumns()(df: DataFrame): DataFrame

    Converts camel case columns to snake case. Example: SomeColumn -> some_column

  8. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native() @HotSpotIntrinsicCandidate()
  9. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  10. def equals(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  11. def extractFromJson(colName: String, outputColName: String, path: String)(df: DataFrame): DataFrame

    Extracts an object from a JSON field with a specified path expression

    Extracts an object from a JSON field with a specified path expression

    val sourceDF = spark.createDF(
      List(
        (10, """{"name": "Bart cool", "age": 25}"""),
        (20, """{"name": "Lisa frost", "age": 27}""")
      ), List(
        ("id", IntegerType, true),
        ("person", StringType, true)
      )
    )
    
    val actualDF = sourceDF.transform(
      transformations.extractFromJson("person", "name", "$.name")
    )
    
    actualDF.show()
    +---+---------------------------------+----------------+
    |id |person                           |name            |
    +---+---------------------------------+----------------+
    |10 |{"name": "Bart cool", "age": 25} |"Bart cool"     |
    |20 |{"name": "Lisa frost", "age": 27}|"Lisa frost"    |
    +---+---------------------------------+----------------+
  12. def extractFromJson(colName: String, outputColName: String, jsonSchema: StructType)(df: DataFrame): DataFrame

    Extracts an object from a JSON field with a specified schema

    Extracts an object from a JSON field with a specified schema

    val sourceDF = spark.createDF(
      List(
        (10, """{"name": "Bart cool", "age": 25}"""),
        (20, """{"name": "Lisa frost", "age": 27}""")
      ), List(
        ("id", IntegerType, true),
        ("person", StringType, true)
      )
    )
    
    val personSchema = StructType(List(
      StructField("name", StringType),
      StructField("age", IntegerType)
    ))
    
    val actualDF = sourceDF.transform(
      transformations.extractFromJson("person", "personData", personSchema)
    )
    
    actualDF.show()
    +---+---------------------------------+----------------+
    |id |person                           |personData      |
    +---+---------------------------------+----------------+
    |10 |{"name": "Bart cool", "age": 25} |[Bart cool, 25] |
    |20 |{"name": "Lisa frost", "age": 27}|[Lisa frost, 27]|
    +---+---------------------------------+----------------+
  13. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  14. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  15. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  16. def modifyColumnNames(stringFun: (String) ⇒ String)(df: DataFrame): DataFrame

    Changes all the column names in a DataFrame

  17. def multiRegexpReplace(cols: List[Column], pattern: String = "\u0000", replacement: String = "")(df: DataFrame): DataFrame

    Runs regexp_replace on multiple columns

    Runs regexp_replace on multiple columns

    val actualDF = sourceDF.transform(
      transformations.multiRegexpReplace(
        List(col("person"), col("phone")),
        "cool",
        "dude"
      )
    )

    Replaces all "cool" strings in the person and phone columns with the string "dude".

  18. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  19. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  20. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  21. def prependToColName(str: String)(df: DataFrame): DataFrame
  22. def snakeCaseColumns()(df: DataFrame): DataFrame

    snake_cases all the columns of a DataFrame. spark-daria defines a com.github.mrpowers.spark.daria.sql.transformations.snakeCaseColumns transformation to convert all the column names to snake_case.

    snake_cases all the columns of a DataFrame. spark-daria defines a com.github.mrpowers.spark.daria.sql.transformations.snakeCaseColumns transformation to convert all the column names to snake_case.

    import com.github.mrpowers.spark.daria.sql.transformations._

    val sourceDf = Seq(
      ("funny", "joke")
    ).toDF("A b C", "de F")
    
    val actualDf = sourceDf.transform(snakeCaseColumns)
    
    actualDf.show()
    
    +-----+----+
    |a_b_c|de_f|
    +-----+----+
    |funny|joke|
    +-----+----+
  23. def snakifyColumns()(df: DataFrame): DataFrame

    snakifies all the columns of a DataFrame

    snakifies all the columns of a DataFrame

    import com.github.mrpowers.spark.daria.sql.transformations._

    val sourceDf = Seq(
      ("funny", "joke")
    ).toDF("ThIs", "BiH")
    
    val actualDf = sourceDf.transform(snakifyColumns())
    
    actualDf.show()
    
    +-----+----+
    |th_is|bi_h|
    +-----+----+
    |funny|joke|
    +-----+----+
  24. def sortColumns(order: String = "asc")(df: DataFrame): DataFrame

    Sorts the columns of a DataFrame alphabetically. The sortColumns transformation sorts the columns in a DataFrame alphabetically.

    Sorts the columns of a DataFrame alphabetically. The sortColumns transformation sorts the columns in a DataFrame alphabetically.

    Suppose you start with the following sourceDF:

    +-----+---+-----+
    | name|age|sport|
    +-----+---+-----+
    |pablo|  3| polo|
    +-----+---+-----+

    Run the code:

    val actualDF = sourceDF.transform(sortColumns())

    Here’s the actualDF:

    +---+-----+-----+
    |age| name|sport|
    +---+-----+-----+
    |  3|pablo| polo|
    +---+-----+-----+
  25. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  26. def titleCaseColumns()(df: DataFrame): DataFrame

    Title Cases all the columns of a DataFrame

  27. def toSnakeCaseColumns()(df: DataFrame): DataFrame
  28. def toString(): String
    Definition Classes
    AnyRef → Any
  29. def truncateColumns(columnLengths: Map[String, Int])(df: DataFrame): DataFrame

    Truncates multiple columns in a DataFrame

    Truncates multiple columns in a DataFrame

    val columnLengths: Map[String, Int] = Map(
      "person" -> 2,
      "phone" -> 3
    )
    
    sourceDF.transform(
      truncateColumns(columnLengths)
    )

    Limits the "person" column to 2 characters and the "phone" column to 3 characters.

  30. def upperCaseColumns()(df: DataFrame): DataFrame
  31. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  32. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  33. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  34. def withColBucket(colName: String, outputColName: String, buckets: Array[(Any, Any)], inclusiveBoundries: Boolean = false, lowestBoundLte: Boolean = false, highestBoundGte: Boolean = false)(df: DataFrame): DataFrame

    Categorizes a numeric column into various user-specified "buckets"

  35. def withParquetCompatibleColumnNames()(df: DataFrame): DataFrame

    Strips out invalid characters and replaces spaces with underscores to make Parquet compatible column names

  36. def withRowAsStruct(outputColName: String = "row_as_struct")(df: DataFrame): DataFrame

Deprecated Value Members

  1. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] ) @Deprecated
    Deprecated

Inherited from AnyRef

Inherited from Any

Ungrouped