diff --git a/rules/S7187/metadata.json b/rules/S7187/metadata.json
new file mode 100644
index 0000000000..2c63c08510
--- /dev/null
+++ b/rules/S7187/metadata.json
@@ -0,0 +1,2 @@
+{
+}
diff --git a/rules/S7187/python/metadata.json b/rules/S7187/python/metadata.json
new file mode 100644
index 0000000000..08a4e81262
--- /dev/null
+++ b/rules/S7187/python/metadata.json
@@ -0,0 +1,25 @@
+{
+  "title": "PySpark Pandas DataFrame columns should not use a reserved name",
+  "type": "CODE_SMELL",
+  "status": "ready",
+  "remediation": {
+    "func": "Constant\/Issue",
+    "constantCost": "5min"
+  },
+  "tags": [
+    "data-science",
+    "pyspark"
+  ],
+  "defaultSeverity": "Major",
+  "ruleSpecification": "RSPEC-7187",
+  "sqKey": "S7187",
+  "scope": "All",
+  "defaultQualityProfiles": ["Sonar way"],
+  "quickfix": "infeasible",
+  "code": {
+    "impacts": {
+      "RELIABILITY": "MEDIUM"
+    },
+    "attribute": "CONVENTIONAL"
+  }
+}
diff --git a/rules/S7187/python/rule.adoc b/rules/S7187/python/rule.adoc
new file mode 100644
index 0000000000..926bdbf50d
--- /dev/null
+++ b/rules/S7187/python/rule.adoc
@@ -0,0 +1,42 @@
+This rule raises an issue when a PySpark Pandas DataFrame column name is set to a reserved name.
+
+== Why is this an issue?
+
+PySpark offers powerful APIs to work with Pandas DataFrames in a distributed environment.
+While the integration between PySpark and Pandas is largely seamless, there are some caveats that should be taken into account.
+
+The Spark Pandas API uses some special column names for internal purposes.
+These column names start and end with `++__++`.
+Therefore, when naming or renaming columns with the PySpark Pandas API,
+avoid such reserved column names, as they are not guaranteed to yield the expected results.
+
+== How to fix it
+
+To fix this issue, provide a column name without a leading and trailing `++__++`.
+
+=== Code examples
+
+==== Noncompliant code example
+
+[source,python,diff-id=1,diff-type=noncompliant]
+----
+import pyspark.pandas as ps
+
+df = ps.DataFrame({'__value__': [1, 2, 3]}) # Noncompliant: __value__ is a reserved column name
+----
+
+==== Compliant solution
+
+[source,python,diff-id=1,diff-type=compliant]
+----
+import pyspark.pandas as ps
+
+df = ps.DataFrame({'value': [1, 2, 3]}) # Compliant
+----
+
+
+== Resources
+=== Documentation
+
+* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/best_practices.html#avoid-reserved-column-names[Best Practices]
+
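
The rule description above also mentions renaming columns, which the diff's examples do not show. The sketch below illustrates that case using the `pyspark.pandas.DataFrame.rename` API; the replacement name `amount` is an arbitrary choice, and whether the analyzer flags renames in addition to construction is inferred from the rule text rather than stated in the diff.

[source,python]
----
import pyspark.pandas as ps

df = ps.DataFrame({'value': [1, 2, 3]})

# Renaming an existing column to a name with leading and trailing __
# falls under the same reserved-name convention described by the rule.
renamed = df.rename(columns={'value': '__value__'})  # presumably Noncompliant: reserved name pattern

# Any name without leading and trailing __ is acceptable; 'amount' is
# only an illustrative example.
renamed = df.rename(columns={'value': 'amount'})  # Compliant
----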