From 9d7de6d39d9be8e660252ecefdaef4924a0a6ce3 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 20 Feb 2025 11:20:42 +0100
Subject: [PATCH] Create rule S7182: The subset argument should be provided
 when using PySpark DataFrame dropDuplicates (#4615)

* Create Rule S7182: The `subset` argument should be provided when using
  PySpark DataFrame `dropDuplicates`

---------

Co-authored-by: joke1196
Co-authored-by: David Kunzmann
---
 rules/S7182/metadata.json        |   2 +
 rules/S7182/python/metadata.json |  26 ++++++
 rules/S7182/python/rule.adoc     | 134 +++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+)
 create mode 100644 rules/S7182/metadata.json
 create mode 100644 rules/S7182/python/metadata.json
 create mode 100644 rules/S7182/python/rule.adoc

diff --git a/rules/S7182/metadata.json b/rules/S7182/metadata.json
new file mode 100644
index 0000000000..2c63c08510
--- /dev/null
+++ b/rules/S7182/metadata.json
@@ -0,0 +1,2 @@
+{
+}
diff --git a/rules/S7182/python/metadata.json b/rules/S7182/python/metadata.json
new file mode 100644
index 0000000000..c20dc26864
--- /dev/null
+++ b/rules/S7182/python/metadata.json
@@ -0,0 +1,26 @@
+{
+  "title": "The \"subset\" argument should be provided when using the PySpark DataFrame \"dropDuplicates\" method",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "5min"
+  },
+  "tags": [
+    "pyspark",
+    "data-science"
+  ],
+  "defaultSeverity": "Major",
+  "ruleSpecification": "RSPEC-7182",
+  "sqKey": "S7182",
+  "scope": "All",
+  "defaultQualityProfiles": ["Sonar way"],
+  "quickfix": "partial",
+  "code": {
+    "impacts": {
+      "MAINTAINABILITY": "MEDIUM",
+      "RELIABILITY": "MEDIUM"
+    },
+    "attribute": "CONVENTIONAL"
+  }
+}
diff --git a/rules/S7182/python/rule.adoc b/rules/S7182/python/rule.adoc
new file mode 100644
index 0000000000..99688f8e8f
--- /dev/null
+++ b/rules/S7182/python/rule.adoc
@@ -0,0 +1,134 @@
+This rule raises an issue when no value is provided to the `subset` parameter of the PySpark DataFrame `dropDuplicates` method.
+
+== Why is this an issue?
+
+In PySpark, the `dropDuplicates` method is used to remove duplicate rows from a DataFrame.
+By default, if no column names are provided, `dropDuplicates` considers all columns when identifying duplicates.
+
+This default is defensive and avoids removing rows that are only partially similar, but it can also lead to:
+
+ * unintended results. The simplest example is removing duplicates from a DataFrame that holds
+a unique id per row. It is easy to forget that the id is part of the DataFrame; since every row differs in its id,
+no two rows are ever considered duplicates, and the output DataFrame is identical to the input DataFrame.
+For example, applying `dropDuplicates` to the following DataFrame will not remove any rows:
+
+[source,text]
+----
++---+-----+---+
+| id| name|age|
++---+-----+---+
+| 1|Alice| 29|
+| 2| Bob| 29|
+| 3|Alice| 29|
+| 4|Alice| 30|
+| 5| Bob| 29|
++---+-----+---+
+----
+
+ * performance inefficiencies. Identifying duplicates across all columns is a costly operation, as Spark has to compare every column of every row.
+
+To ensure clarity, prevent incorrect results, and optimize performance,
+it is a good practice to specify the column names when using `dropDuplicates`.
+
+This rule raises issues on `pyspark.sql.DataFrame.dropDuplicates`, `pyspark.sql.DataFrame.drop_duplicates`,
+and `pyspark.sql.DataFrame.dropDuplicatesWithinWatermark`.
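+
+While the examples below focus on the batch API, the same reasoning applies to the streaming variant. As a minimal
+sketch (assuming Spark 3.5 or later, where `dropDuplicatesWithinWatermark` is available, and using the built-in
+`rate` streaming source, which produces `timestamp` and `value` columns):
+
+[source,python]
+----
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+
+# A streaming DataFrame; dropDuplicatesWithinWatermark requires a watermark to be set.
+events = spark.readStream.format("rate").load().withWatermark("timestamp", "10 seconds")
+
+deduped = events.dropDuplicatesWithinWatermark()                  # Noncompliant: compares all columns
+deduped = events.dropDuplicatesWithinWatermark(subset=["value"])  # Compliant
+----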
+
+=== Exceptions
+
+If, however, the intent is to remove duplicates based on all columns, the `distinct` method can be used, or
+`None` can be passed explicitly to the `subset` parameter. This way the intention is clear, and this rule will not raise any issues.
+
+
+[source,python]
+----
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+data = ...
+
+df = spark.createDataFrame(data, ["id", "name", "age"])
+
+df_dedup = df.dropDuplicates(None) # Compliant
+df_dedup = df.dropDuplicates(subset=None) # Compliant
+df_dedup = df.distinct() # Compliant
+----
+
+== How to fix it
+
+To fix this issue, provide the column names to the `subset` parameter of the `dropDuplicates` method or use the `distinct` method instead.
+
+=== Code examples
+
+==== Noncompliant code example
+
+[source,python,diff-id=1,diff-type=noncompliant]
+----
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+data = [
+    (1, "Alice", 29),
+    (2, "Bob", 29),
+    (3, "Alice", 29),
+    (4, "Alice", 30),
+    (5, "Bob", 29)
+]
+df = spark.createDataFrame(data, ["id", "name", "age"])
+
+df_dedup = df.dropDuplicates() # Noncompliant: no column names are specified
+----
+
+The code example above results in no rows being removed:
+
+[cols="1,3,1"]
+|===
+|id |name |age
+
+| 1|Alice| 29
+| 2| Bob| 29
+| 3|Alice| 29
+| 4|Alice| 30
+| 5| Bob| 29
+
+|===
+
+==== Compliant solution
+
+[source,python,diff-id=1,diff-type=compliant]
+----
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.getOrCreate()
+data = [
+    (1, "Alice", 29),
+    (2, "Bob", 29),
+    (3, "Alice", 29),
+    (4, "Alice", 30),
+    (5, "Bob", 29)
+]
+df = spark.createDataFrame(data, ["id", "name", "age"])
+
+df_dedup = df.dropDuplicates(subset=["name", "age"]) # Compliant
+----
+
+In this example, duplicates are removed based on the `name` and `age` columns:
+
+[cols="1,3,1"]
+|===
+|id |name |age
+
+| 1|Alice| 29
+| 2| Bob| 29
+| 4|Alice| 30
+
+|===
+
+== Resources
+=== Documentation
+
+ * PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.dropDuplicates.html[pyspark.sql.DataFrame.dropDuplicates]
+
+=== Articles & blog posts
+
+ * StrataScratch blog - https://www.stratascratch.com/blog/how-to-drop-duplicates-in-pyspark/[How to drop duplicates in PySpark]
+ * Medium blog - https://medium.com/@santosh_beora/distinct-and-dropduplicates-in-pyspark-fedb1e9e8738[distinct() and dropDuplicates() in PySpark]