diff --git a/rules/S6971/metadata.json b/rules/S6971/metadata.json new file mode 100644 index 0000000000..2c63c08510 --- /dev/null +++ b/rules/S6971/metadata.json @@ -0,0 +1,2 @@ +{ +} diff --git a/rules/S6971/python/metadata.json b/rules/S6971/python/metadata.json new file mode 100644 index 0000000000..9229df7782 --- /dev/null +++ b/rules/S6971/python/metadata.json @@ -0,0 +1,24 @@ +{ + "title": "Transformers should not be accessed directly when a Scikit-Learn Pipeline uses caching", + "type": "BUG", + "status": "ready", + "remediation": { + "func": "Constant\/Issue", + "constantCost": "5min" + }, + "tags": [ + ], + "defaultSeverity": "Major", + "ruleSpecification": "RSPEC-6971", + "sqKey": "S6971", + "scope": "All", + "defaultQualityProfiles": ["Sonar way"], + "quickfix": "targeted", + "code": { + "impacts": { + "MAINTAINABILITY": "MEDIUM", + "RELIABILITY": "HIGH" + }, + "attribute": "LOGICAL" + } +} diff --git a/rules/S6971/python/rule.adoc b/rules/S6971/python/rule.adoc new file mode 100644 index 0000000000..be14b4c223 --- /dev/null +++ b/rules/S6971/python/rule.adoc @@ -0,0 +1,76 @@ +This rule raises an issue when trying to access a Scikit-Learn transformer used in a pipeline with caching directly. + +== Why is this an issue? + +When using a pipeline with a cache and passing the transformer objects as an instance from a variable, it is possible to access the transformer objects directly. + +This is an issue since all the transformers are cloned when the Pipeline is fitted, and therefore, the objects outside the Pipeline are not updated and will yield unexpected results. + +== How to fix it + +Replace the direct access to the transformer with an access to the `named_steps` attribute of the pipeline. + +=== Code examples + +==== Noncompliant code example + +[source,python,diff-id=1,diff-type=noncompliant] +---- +from sklearn.datasets import load_diabetes +from sklearn.preprocessing import RobustScaler +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import Pipeline + +diabetes = load_diabetes() +scaler = RobustScaler() +knn = KNeighborsRegressor(n_neighbors=5) + +pipeline = Pipeline([ + ('scaler', scaler), + ('knn', knn) +], memory="cache") + +pipeline.fit(diabetes.data, diabetes.target) +print(scaler.center_) # Noncompliant : raises an AttributeError + +---- + +==== Compliant solution + +[source,python,diff-id=1,diff-type=compliant] +---- +from sklearn.datasets import load_diabetes +from sklearn.preprocessing import RobustScaler +from sklearn.neighbors import KNeighborsRegressor +from sklearn.pipeline import Pipeline + +diabetes = load_diabetes() +scaler = RobustScaler() +knn = KNeighborsRegressor(n_neighbors=5) + +pipeline = Pipeline([ + ('scaler', scaler), + ('knn', knn) +], memory="cache") + +pipeline.fit(diabetes.data, diabetes.target) +print(pipeline.named_steps['scaler'].center_) # Compliant +---- + +ifdef::env-github,rspecator-view[] + +== Implementation specification + +Message : Replace access with `named_steps` attribute of the Pipeline +Issue location : on the transformer `Name` before the dot +Secondary location : in the Pipeline array, when giving the object to the Pipeline + +Quickfix : possible when the Pipeline is created with the Pipeline constructor. With the make_pipeline, the names are automatically generated, so might be too complicated. + +endif::env-github,rspecator-view[] + + +== Resources +=== Documentation + +* Scikit-Learn - Pipelines and composite estimators : https://scikit-learn.org/stable/modules/compose.html#warning:-side-effect-of-caching-transformers[Side effect of caching transformers]