From c046fc94c4d49709ccf5aa7adc7d8c1d06af2db9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 19 Feb 2025 10:58:21 +0000
Subject: [PATCH] Create rule S7195: PySpark lit(None) should be used when
 populating empty columns (#4638)

---
 rules/S7195/metadata.json        |  2 +
 rules/S7195/python/metadata.json | 25 ++++++++++++
 rules/S7195/python/rule.adoc     | 66 ++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+)
 create mode 100644 rules/S7195/metadata.json
 create mode 100644 rules/S7195/python/metadata.json
 create mode 100644 rules/S7195/python/rule.adoc

diff --git a/rules/S7195/metadata.json b/rules/S7195/metadata.json
new file mode 100644
index 0000000000..2c63c08510
--- /dev/null
+++ b/rules/S7195/metadata.json
@@ -0,0 +1,2 @@
+{
+}
diff --git a/rules/S7195/python/metadata.json b/rules/S7195/python/metadata.json
new file mode 100644
index 0000000000..d858e9c665
--- /dev/null
+++ b/rules/S7195/python/metadata.json
@@ -0,0 +1,25 @@
+{
+  "title": "PySpark lit(None) should be used when populating empty columns",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "5min"
+  },
+  "tags": [
+    "data-science",
+    "pyspark"
+  ],
+  "defaultSeverity": "Major",
+  "ruleSpecification": "RSPEC-7195",
+  "sqKey": "S7195",
+  "scope": "All",
+  "defaultQualityProfiles": ["Sonar way"],
+  "quickfix": "unknown",
+  "code": {
+    "impacts": {
+      "RELIABILITY": "MEDIUM"
+    },
+    "attribute": "CONVENTIONAL"
+  }
+}
diff --git a/rules/S7195/python/rule.adoc b/rules/S7195/python/rule.adoc
new file mode 100644
index 0000000000..e8edfd9af2
--- /dev/null
+++ b/rules/S7195/python/rule.adoc
@@ -0,0 +1,66 @@
+This rule raises an issue when a column of a PySpark DataFrame is populated with `lit('')`.
+
+== Why is this an issue?
+
+In PySpark, when populating a DataFrame column with empty or null values, it is recommended to use `lit(None)`.
+Using literals such as `lit('')` as a placeholder for absent values can lead to data misinterpretation and inconsistencies.
+
+The usage of `lit(None)` ensures clarity and consistency in the codebase, making it explicit that the column is intentionally populated with null values.
+Using `lit(None)` also preserves the ability to use functions such as `isnull` or `isnotnull` to check for null values in the DataFrame.
+
+== How to fix it
+
+To fix this issue, replace `lit('')` with `lit(None)` when populating a DataFrame column with empty/null values.
+
+=== Code examples
+
+==== Noncompliant code example
+
+[source,python,diff-id=1,diff-type=noncompliant]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import lit
+
+spark = SparkSession.builder.appName("Example").getOrCreate()
+
+data = [
+    (1, "Alice"),
+    (2, "Bob"),
+    (3, "Charlie")
+]
+
+df = spark.createDataFrame(data, ["id", "name"])
+
+df_with_empty_column = df.withColumn("middle_name", lit('')) # Noncompliant: usage of lit('') to represent an empty value
+----
+
+==== Compliant solution
+
+[source,python,diff-id=1,diff-type=compliant]
+----
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import lit
+
+spark = SparkSession.builder.appName("Example").getOrCreate()
+
+data = [
+    (1, "Alice"),
+    (2, "Bob"),
+    (3, "Charlie")
+]
+
+df = spark.createDataFrame(data, ["id", "name"])
+
+df_with_empty_column = df.withColumn("middle_name", lit(None)) # Compliant
+----
+
+== Resources
+=== Documentation
+
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark-sql-functions-lit[pyspark.sql.functions.lit]
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark-sql-functions-isnull[pyspark.sql.functions.isnull]
+
+=== Standards
+
+* Palantir PySpark Style Guide - https://github.com/palantir/pyspark-style-guide?tab=readme-ov-file#empty-columns[empty-columns]
+