From c046fc94c4d49709ccf5aa7adc7d8c1d06af2db9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2025 10:58:21 +0000 Subject: [PATCH] Create rule S7195: PySpark lit(None) should be used when populating empty columns (#4638) --- rules/S7195/metadata.json | 2 + rules/S7195/python/metadata.json | 25 ++++++++++++ rules/S7195/python/rule.adoc | 66 ++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) create mode 100644 rules/S7195/metadata.json create mode 100644 rules/S7195/python/metadata.json create mode 100644 rules/S7195/python/rule.adoc diff --git a/rules/S7195/metadata.json b/rules/S7195/metadata.json new file mode 100644 index 0000000000..2c63c08510 --- /dev/null +++ b/rules/S7195/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json new file mode 100644 index 0000000000..d858e9c665 --- /dev/null +++ b/rules/S7195/python/metadata.json @@ -0,0 +1,25 @@ +{ + "title": "PySpark lit(None) should be used when populating empty columns", + "type": "CODE_SMELL", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + "data-science", + "pyspark" + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-7195", + "sqKey": "S7195", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "unknown", + "code": { + "impacts": { + "RELIABILITY": "MEDIUM" + }, + "attribute": "CONVENTIONAL" + } +} diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc new file mode 100644 index 0000000000..e8edfd9af2 --- /dev/null +++ b/rules/S7195/python/rule.adoc @@ -0,0 +1,66 @@ +This rule raises an issue when a column of a PySpark DataFrame is populated with `lit('')`. + +== Why is this an issue? + +In PySpark, when populating a DataFrame column with empty or null values, it is recommended to use `lit(None)`. 
+Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies. + +The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values. +Using `lit(None)` also preserves the ability to use functions such as `isnull` or `isnotnull` to check for null values in the DataFrame. + +== How to fix it + +To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame column with empty/null values. + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +from pyspark.sql import SparkSession +from pyspark.sql.functions import lit + +spark = SparkSession.builder.appName("Example").getOrCreate() + +data = [ + (1, "Alice"), + (2, "Bob"), + (3, "Charlie") +] + +df = spark.createDataFrame(data, ["id", "name"]) + +df_with_empty_column = df.withColumn("middle_name", lit('')) # Noncompliant: usage of lit('') to represent an empty value +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +from pyspark.sql import SparkSession +from pyspark.sql.functions import lit + +spark = SparkSession.builder.appName("Example").getOrCreate() + +data = [ + (1, "Alice"), + (2, "Bob"), + (3, "Charlie") +] + +df = spark.createDataFrame(data, ["id", "name"]) + +df_with_empty_column = df.withColumn("middle_name", lit(None)) # Compliant +---- + +== Resources +=== Documentation + +* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark-sql-functions-lit[pyspark-sql-functions-lit] +* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark-sql-functions-isnull[pyspark-sql-functions-isnull] + +=== Standards + +* Palantir PySpark Style Guide - 
https://github.com/palantir/pyspark-style-guide?tab=readme-ov-file#empty-columns[empty-columns] +