## Licensed to the Apache Software Foundation (ASF) under one or more# contributor license agreements. See the NOTICE file distributed with# this work for additional information regarding copyright ownership.# The ASF licenses this file to You under the Apache License, Version 2.0# (the "License"); you may not use this file except in compliance with# the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.#fromabcimportABCMeta,abstractmethodfromtypingimport(Any,Generic,List,Optional,TypeVar,Union,TYPE_CHECKING,Tuple,Callable,)importpandasaspdfrompysparkimportsincefrompyspark.ml.commonimportinherit_docfrompyspark.sql.dataframeimportDataFramefrompyspark.ml.paramimportParamsfrompyspark.ml.param.sharedimport(HasLabelCol,HasFeaturesCol,HasPredictionCol,)frompyspark.ml.connect.utilimporttransform_dataframe_columnifTYPE_CHECKING:frompyspark.ml._typingimportParamMapM=TypeVar("M",bound="Transformer")
@inherit_doc
class Estimator(Params, Generic[M], metaclass=ABCMeta):
    """
    Abstract class for estimators that fit models to data.

    .. versionadded:: 3.5.0
    """

    @abstractmethod
    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> M:
        """
        Fits a model to the input dataset. This is called by the default
        implementation of fit.

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            input dataset

        Returns
        -------
        :class:`Transformer`
            fitted model
        """
        raise NotImplementedError()

    def fit(
        self,
        dataset: Union[DataFrame, pd.DataFrame],
        params: Optional["ParamMap"] = None,
    ) -> Union[M, List[M]]:
        """
        Fits a model to the input dataset with optional parameters.

        .. versionadded:: 3.5.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            input dataset, it can be either pandas dataframe or spark dataframe.
        params : a dict of param values, optional
            an optional param map that overrides embedded params. When it is
            non-empty, fitting is performed on ``self.copy(params)`` instead of
            on this instance.

        Returns
        -------
        :py:class:`Transformer`
            fitted model

        Raises
        ------
        TypeError
            if `params` is neither None nor a dict.
        """
        # NOTE: the return annotation still admits ``List[M]`` to keep the
        # signature unchanged; this implementation only ever returns the single
        # result of ``_fit``.
        if params is None:
            return self._fit(dataset)
        if not isinstance(params, dict):
            # Only a single param map (dict) is supported here, so the message
            # must not advertise list/tuple inputs.
            raise TypeError("Params must be a param map but got %s." % type(params))
        # An empty param map overrides nothing, so no copy is needed.
        estimator = self.copy(params) if params else self
        return estimator._fit(dataset)
@inherit_doc
class Transformer(Params, metaclass=ABCMeta):
    """
    Abstract class for transformers that transform one dataset into another.

    .. versionadded:: 3.5.0
    """

    def _input_columns(self) -> List[str]:
        """
        Return a list of input column names which are used as inputs of transformation.
        """
        raise NotImplementedError()

    def _output_columns(self) -> List[Tuple[str, str]]:
        """
        Return a list of output transformed columns, each element in the list
        is a tuple of (column_name, column_spark_type)
        """
        raise NotImplementedError()

    def _get_transform_fn(self) -> Callable[..., Any]:
        """
        Return a transformation function that accepts one or more `pd.Series`
        instances as inputs and returns the transformed result as an instance of
        `pd.Series` or `pd.DataFrame`.
        If there's only one output column, the transformed result must be an
        instance of `pd.Series`; if there are multiple output columns, the
        transformed result must be an instance of `pd.DataFrame` with column
        names matching the output schema returned by the `_output_columns`
        interface.
        """
        raise NotImplementedError()

    def transform(
        self, dataset: Union[DataFrame, pd.DataFrame], params: Optional["ParamMap"] = None
    ) -> Union[DataFrame, pd.DataFrame]:
        """
        Transforms the input dataset.
        The dataset can be either pandas dataframe or spark dataframe,
        if it is a spark DataFrame, the result of transformation is a new spark
        DataFrame that contains all existing columns and output columns with names,
        If it is a pandas DataFrame, the result of transformation is a shallow copy
        of the input pandas dataframe with output columns with names.

        Note: Transformers do not allow an output column to have the same name
        as an existing column.

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            input dataset.
        params : dict, optional
            an optional param map that overrides embedded params. When it is
            non-empty, the transformation runs on ``self.copy(params)``.

        Returns
        -------
        :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            transformed dataset, the type of output dataframe is consistent with
            input dataframe.

        Raises
        ------
        TypeError
            if `params` is neither None nor a dict.
        ValueError
            if an output column name already exists in `dataset`.
        """
        if params is None:
            return self._transform(dataset)
        if not isinstance(params, dict):
            # Previously this case fell through and silently returned None;
            # reject it explicitly instead.
            raise TypeError("Params must be a param map but got %s." % type(params))
        # An empty param map overrides nothing, so no copy is needed.
        transformer = self.copy(params) if params else self
        return transformer._transform(dataset)

    def _transform(
        self, dataset: Union[DataFrame, pd.DataFrame]
    ) -> Union[DataFrame, pd.DataFrame]:
        """
        Apply the transformation function to `dataset` after checking that no
        output column name collides with an existing column.

        Raises
        ------
        ValueError
            if any name from `_output_columns` is already a column of `dataset`.
        """
        input_cols = self._input_columns()
        transform_fn = self._get_transform_fn()
        output_cols = self._output_columns()

        # Build the lookup once so each collision check is a constant-time test.
        existing_cols = set(dataset.columns)
        if any(col_name in existing_cols for col_name, _ in output_cols):
            raise ValueError(
                "Transformers does not allow output column having the same name with "
                "existing columns."
            )

        return transform_dataframe_column(
            dataset,
            input_cols=input_cols,
            transform_fn=transform_fn,
            output_cols=output_cols,
        )
@inherit_doc
class Evaluator(Params, metaclass=ABCMeta):
    """
    Base class for evaluators that compute metrics from predictions.

    .. versionadded:: 3.5.0
    """

    @abstractmethod
    def _evaluate(self, dataset: Union["DataFrame", "pd.DataFrame"]) -> float:
        """
        Evaluates the output.

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            a dataset that contains labels/observations and predictions

        Returns
        -------
        float
            metric
        """
        raise NotImplementedError()

    def evaluate(
        self,
        dataset: Union["DataFrame", "pd.DataFrame"],
        params: Optional["ParamMap"] = None,
    ) -> float:
        """
        Evaluates the output with optional parameters.

        .. versionadded:: 3.5.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame` or :py:class:`pandas.DataFrame`
            a dataset that contains labels/observations and predictions
        params : dict, optional
            an optional param map that overrides embedded params. When it is
            non-empty, evaluation runs on ``self.copy(params)``.

        Returns
        -------
        float
            metric

        Raises
        ------
        TypeError
            if `params` is neither None nor a dict.
        """
        # `dataset` is forwarded untouched to `_evaluate`, so this method is
        # annotated with the same input types `_evaluate` declares.
        if params is None:
            return self._evaluate(dataset)
        if not isinstance(params, dict):
            raise TypeError("Params must be a param map but got %s." % type(params))
        # An empty param map overrides nothing, so no copy is needed.
        evaluator = self.copy(params) if params else self
        return evaluator._evaluate(dataset)

    @since("1.5.0")
    def isLargerBetter(self) -> bool:
        """
        Indicates whether the metric returned by :py:meth:`evaluate` should be maximized
        (True, default) or minimized (False).
        A given evaluator may support multiple metrics which may be maximized or minimized.
        """
        # The base class provides no answer; calling this without an override
        # raises NotImplementedError.
        raise NotImplementedError()
@inherit_doc
class Model(Transformer, metaclass=ABCMeta):
    """
    Abstract base for models that are produced by fitting an estimator.

    Adds no members of its own on top of :py:class:`Transformer`.

    .. versionadded:: 3.5.0
    """
@inherit_doc
class _PredictorParams(HasLabelCol, HasFeaturesCol, HasPredictionCol):
    """
    Shared params for :py:class:`Predictor` and :py:class:`PredictionModel`:
    the label, features and prediction column params.

    .. versionadded:: 3.5.0
    """


@inherit_doc
class Predictor(Estimator[M], _PredictorParams, metaclass=ABCMeta):
    """
    Estimator for prediction tasks (regression and classification).
    """

    @since("3.5.0")
    def setLabelCol(self, value: str) -> "Predictor":
        """
        Sets the value of :py:attr:`labelCol`.
        """
        return self._set(labelCol=value)

    @since("3.5.0")
    def setFeaturesCol(self, value: str) -> "Predictor":
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        return self._set(featuresCol=value)

    @since("3.5.0")
    def setPredictionCol(self, value: str) -> "Predictor":
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)


@inherit_doc
class PredictionModel(Model, _PredictorParams, metaclass=ABCMeta):
    """
    Model for prediction tasks (regression and classification).
    """

    @since("3.5.0")
    def setFeaturesCol(self, value: str) -> "PredictionModel":
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        return self._set(featuresCol=value)

    @since("3.5.0")
    def setPredictionCol(self, value: str) -> "PredictionModel":
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        return self._set(predictionCol=value)

    @property
    @abstractmethod
    @since("3.5.0")
    def numFeatures(self) -> int:
        """
        Returns the number of features the model was trained on. If unknown, returns -1
        """
        raise NotImplementedError()