Source code for pyspark.ml.util

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json
import os
import time
import uuid
import functools
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    List,
    Optional,
    Sequence,
    Type,
    TypeVar,
    cast,
    TYPE_CHECKING,
)

from pyspark import since
from pyspark.ml.common import inherit_doc
from pyspark.sql import SparkSession
from pyspark.sql.utils import is_remote
from pyspark.util import VersionUtils

if TYPE_CHECKING:
    from py4j.java_gateway import JavaGateway, JavaObject
    from pyspark.ml._typing import PipelineStage
    from pyspark.ml.base import Params
    from pyspark.ml.wrapper import JavaWrapper
    from pyspark.core.context import SparkContext

T = TypeVar("T")
RW = TypeVar("RW", bound="BaseReadWrite")
W = TypeVar("W", bound="MLWriter")
JW = TypeVar("JW", bound="JavaMLWriter")
RL = TypeVar("RL", bound="MLReadable")
JR = TypeVar("JR", bound="JavaMLReader")

FuncT = TypeVar("FuncT", bound=Callable[..., Any])


def _jvm() -> "JavaGateway":
    """
    Returns the JVM view associated with SparkContext. Must be called
    after SparkContext is initialized.
    """
    from pyspark.core.context import SparkContext

    jvm = SparkContext._jvm
    if jvm:
        return jvm
    else:
        raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")


[docs]class Identifiable:
    """
    Object with a unique ID.
    """

    def __init__(self) -> None:
        #: A unique id for the object.
        self.uid = self._randomUID()

    def __repr__(self) -> str:
        return self.uid

    @classmethod
    def _randomUID(cls) -> str:
        """
        Generate a unique string id for the object. The default implementation
        concatenates the class name, "_", and 12 random hex chars.
        """
        return str(cls.__name__ + "_" + uuid.uuid4().hex[-12:])


[docs]@inherit_doc
class BaseReadWrite:
    """
    Base class for MLWriter and MLReader. Stores information about the SparkContext
    and SparkSession.

    .. versionadded:: 2.3.0
    """

    def __init__(self) -> None:
        self._sparkSession: Optional[SparkSession] = None

[docs]    def session(self: RW, sparkSession: SparkSession) -> RW:
        """
        Sets the Spark Session to use for saving/loading.
        """
        self._sparkSession = sparkSession
        return self

    @property
    def sparkSession(self) -> SparkSession:
        """
        Returns the user-specified Spark Session or the default.
        """
        if self._sparkSession is None:
            self._sparkSession = SparkSession._getActiveSessionOrCreate()
        assert self._sparkSession is not None
        return self._sparkSession

    @property
    def sc(self) -> "SparkContext":
        """
        Returns the underlying `SparkContext`.
        """
        assert self.sparkSession is not None
        return self.sparkSession.sparkContext


[docs]@inherit_doc
class MLWriter(BaseReadWrite):
    """
    Utility class that can save ML instances.

    .. versionadded:: 2.0.0
    """

    def __init__(self) -> None:
        super(MLWriter, self).__init__()
        self.shouldOverwrite: bool = False
        self.optionMap: Dict[str, Any] = {}

    def _handleOverwrite(self, path: str) -> None:
        from pyspark.ml.wrapper import JavaWrapper

        _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.util.FileSystemOverwrite")
        wrapper = JavaWrapper(_java_obj)
        wrapper._call_java("handleOverwrite", path, True, self.sparkSession._jsparkSession)

[docs]    def save(self, path: str) -> None:
        """Save the ML instance to the input path."""
        if self.shouldOverwrite:
            self._handleOverwrite(path)
        self.saveImpl(path)

[docs]    def saveImpl(self, path: str) -> None:
        """
        save() handles overwriting and then calls this method.  Subclasses should override this
        method to implement the actual saving of the instance.
        """
        raise NotImplementedError("MLWriter is not yet implemented for type: %s" % type(self))

[docs]    def overwrite(self) -> "MLWriter":
        """Overwrites if the output path already exists."""
        self.shouldOverwrite = True
        return self

[docs]    def option(self, key: str, value: Any) -> "MLWriter":
        """
        Adds an option to the underlying MLWriter. See the documentation for the specific model's
        writer for possible options. The option name (key) is case-insensitive.
        """
        self.optionMap[key.lower()] = str(value)
        return self


[docs]@inherit_doc
class GeneralMLWriter(MLWriter):
    """
    Utility class that can save ML instances in different formats.

    .. versionadded:: 2.4.0
    """

[docs]    def format(self, source: str) -> "GeneralMLWriter":
        """
        Specifies the format of ML export ("pmml", "internal", or the fully qualified class
        name for export).
        """
        self.source = source
        return self


@inherit_doc
class JavaMLWriter(MLWriter):
    """
    (Private) Specialization of :py:class:`MLWriter` for :py:class:`JavaParams` types
    """

    _jwrite: "JavaObject"

    def __init__(self, instance: "JavaMLWritable"):
        super(JavaMLWriter, self).__init__()
        _java_obj = instance._to_java()  # type: ignore[attr-defined]
        self._jwrite = _java_obj.write()

    def save(self, path: str) -> None:
        """Save the ML instance to the input path."""
        if not isinstance(path, str):
            raise TypeError("path should be a string, got type %s" % type(path))
        self._jwrite.save(path)

    def overwrite(self) -> "JavaMLWriter":
        """Overwrites if the output path already exists."""
        self._jwrite.overwrite()
        return self

    def option(self, key: str, value: str) -> "JavaMLWriter":
        self._jwrite.option(key, value)
        return self

    def session(self, sparkSession: SparkSession) -> "JavaMLWriter":
        """Sets the Spark Session to use for saving."""
        self._jwrite.session(sparkSession._jsparkSession)
        return self


@inherit_doc
class GeneralJavaMLWriter(JavaMLWriter):
    """
    (Private) Specialization of :py:class:`GeneralMLWriter` for :py:class:`JavaParams` types
    """

    def __init__(self, instance: "JavaMLWritable"):
        super(GeneralJavaMLWriter, self).__init__(instance)

    def format(self, source: str) -> "GeneralJavaMLWriter":
        """
        Specifies the format of ML export ("pmml", "internal", or the fully qualified class
        name for export).
        """
        self._jwrite.format(source)
        return self


[docs]@inherit_doc
class MLWritable:
    """
    Mixin for ML instances that provide :py:class:`MLWriter`.

    .. versionadded:: 2.0.0
    """

[docs]    def write(self) -> MLWriter:
        """Returns an MLWriter instance for this ML instance."""
        raise NotImplementedError("MLWritable is not yet implemented for type: %r" % type(self))

[docs]    def save(self, path: str) -> None:
        """Save this ML instance to the given path, a shortcut of 'write().save(path)'."""
        self.write().save(path)


@inherit_doc
class JavaMLWritable(MLWritable):
    """
    (Private) Mixin for ML instances that provide :py:class:`JavaMLWriter`.
    """

    def write(self) -> JavaMLWriter:
        """Returns an MLWriter instance for this ML instance."""
        return JavaMLWriter(self)


@inherit_doc
class GeneralJavaMLWritable(JavaMLWritable):
    """
    (Private) Mixin for ML instances that provide :py:class:`GeneralJavaMLWriter`.
    """

    def write(self) -> GeneralJavaMLWriter:
        """Returns an GeneralMLWriter instance for this ML instance."""
        return GeneralJavaMLWriter(self)


[docs]@inherit_doc
class MLReader(BaseReadWrite, Generic[RL]):
    """
    Utility class that can load ML instances.

    .. versionadded:: 2.0.0
    """

    def __init__(self) -> None:
        super(MLReader, self).__init__()

[docs]    def load(self, path: str) -> RL:
        """Load the ML instance from the input path."""
        raise NotImplementedError("MLReader is not yet implemented for type: %s" % type(self))


@inherit_doc
class JavaMLReader(MLReader[RL]):
    """
    (Private) Specialization of :py:class:`MLReader` for :py:class:`JavaParams` types
    """

    def __init__(self, clazz: Type["JavaMLReadable[RL]"]) -> None:
        super(JavaMLReader, self).__init__()
        self._clazz = clazz
        self._jread = self._load_java_obj(clazz).read()

    def load(self, path: str) -> RL:
        """Load the ML instance from the input path."""
        if not isinstance(path, str):
            raise TypeError("path should be a string, got type %s" % type(path))
        java_obj = self._jread.load(path)
        if not hasattr(self._clazz, "_from_java"):
            raise NotImplementedError(
                "This Java ML type cannot be loaded into Python currently: %r" % self._clazz
            )
        return self._clazz._from_java(java_obj)

    def session(self: JR, sparkSession: SparkSession) -> JR:
        """Sets the Spark Session to use for loading."""
        self._jread.session(sparkSession._jsparkSession)
        return self

    @classmethod
    def _java_loader_class(cls, clazz: Type["JavaMLReadable[RL]"]) -> str:
        """
        Returns the full class name of the Java ML instance. The default
        implementation replaces "pyspark" by "org.apache.spark" in
        the Python full class name.
        """
        java_package = clazz.__module__.replace("pyspark", "org.apache.spark")
        if clazz.__name__ in ("Pipeline", "PipelineModel"):
            # Remove the last package name "pipeline" for Pipeline and PipelineModel.
            java_package = ".".join(java_package.split(".")[0:-1])
        return java_package + "." + clazz.__name__

    @classmethod
    def _load_java_obj(cls, clazz: Type["JavaMLReadable[RL]"]) -> "JavaObject":
        """Load the peer Java object of the ML instance."""
        java_class = cls._java_loader_class(clazz)
        java_obj = _jvm()
        for name in java_class.split("."):
            java_obj = getattr(java_obj, name)
        return java_obj


[docs]@inherit_doc
class MLReadable(Generic[RL]):
    """
    Mixin for instances that provide :py:class:`MLReader`.

    .. versionadded:: 2.0.0
    """

[docs]    @classmethod
    def read(cls) -> MLReader[RL]:
        """Returns an MLReader instance for this class."""
        raise NotImplementedError("MLReadable.read() not implemented for type: %r" % cls)

[docs]    @classmethod
    def load(cls, path: str) -> RL:
        """Reads an ML instance from the input path, a shortcut of `read().load(path)`."""
        return cls.read().load(path)


@inherit_doc
class JavaMLReadable(MLReadable[RL]):
    """
    (Private) Mixin for instances that provide JavaMLReader.
    """

    @classmethod
    def read(cls) -> JavaMLReader[RL]:
        """Returns an MLReader instance for this class."""
        return JavaMLReader(cls)


[docs]@inherit_doc
class DefaultParamsWritable(MLWritable):
    """
    Helper trait for making simple :py:class:`Params` types writable.  If a :py:class:`Params`
    class stores all data as :py:class:`Param` values, then extending this trait will provide
    a default implementation of writing saved instances of the class.
    This only handles simple :py:class:`Param` types; e.g., it will not handle
    :py:class:`pyspark.sql.DataFrame`. See :py:class:`DefaultParamsReadable`, the counterpart
    to this class.

    .. versionadded:: 2.3.0
    """

[docs]    def write(self) -> MLWriter:
        """Returns a DefaultParamsWriter instance for this class."""
        from pyspark.ml.param import Params

        if isinstance(self, Params):
            return DefaultParamsWriter(self)
        else:
            raise TypeError(
                "Cannot use DefaultParamsWritable with type %s because it does not "
                + " extend Params.",
                type(self),
            )


[docs]@inherit_doc
class DefaultParamsWriter(MLWriter):
    """
    Specialization of :py:class:`MLWriter` for :py:class:`Params` types

    Class for writing Estimators and Transformers whose parameters are JSON-serializable.

    .. versionadded:: 2.3.0
    """

    def __init__(self, instance: "Params"):
        super(DefaultParamsWriter, self).__init__()
        self.instance = instance

[docs]    def saveImpl(self, path: str) -> None:
        DefaultParamsWriter.saveMetadata(self.instance, path, self.sc)

[docs]    @staticmethod
    def extractJsonParams(instance: "Params", skipParams: Sequence[str]) -> Dict[str, Any]:
        paramMap = instance.extractParamMap()
        jsonParams = {
            param.name: value for param, value in paramMap.items() if param.name not in skipParams
        }
        return jsonParams

[docs]    @staticmethod
    def saveMetadata(
        instance: "Params",
        path: str,
        sc: "SparkContext",
        extraMetadata: Optional[Dict[str, Any]] = None,
        paramMap: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Saves metadata + Params to: path + "/metadata"

        - class
        - timestamp
        - sparkVersion
        - uid
        - paramMap
        - defaultParamMap (since 2.4.0)
        - (optionally, extra metadata)

        Parameters
        ----------
        extraMetadata : dict, optional
            Extra metadata to be saved at same level as uid, paramMap, etc.
        paramMap : dict, optional
            If given, this is saved in the "paramMap" field.
        """
        metadataPath = os.path.join(path, "metadata")
        metadataJson = DefaultParamsWriter._get_metadata_to_save(
            instance, sc, extraMetadata, paramMap
        )
        sc.parallelize([metadataJson], 1).saveAsTextFile(metadataPath)

    @staticmethod
    def _get_metadata_to_save(
        instance: "Params",
        sc: "SparkContext",
        extraMetadata: Optional[Dict[str, Any]] = None,
        paramMap: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Helper for :py:meth:`DefaultParamsWriter.saveMetadata` which extracts the JSON to save.
        This is useful for ensemble models which need to save metadata for many sub-models.

        Notes
        -----
        See :py:meth:`DefaultParamsWriter.saveMetadata` for details on what this includes.
        """
        uid = instance.uid
        cls = instance.__module__ + "." + instance.__class__.__name__

        # User-supplied param values
        params = instance._paramMap
        jsonParams = {}
        if paramMap is not None:
            jsonParams = paramMap
        else:
            for p in params:
                jsonParams[p.name] = params[p]

        # Default param values
        jsonDefaultParams = {}
        for p in instance._defaultParamMap:
            jsonDefaultParams[p.name] = instance._defaultParamMap[p]

        basicMetadata = {
            "class": cls,
            "timestamp": int(round(time.time() * 1000)),
            "sparkVersion": sc.version,
            "uid": uid,
            "paramMap": jsonParams,
            "defaultParamMap": jsonDefaultParams,
        }
        if extraMetadata is not None:
            basicMetadata.update(extraMetadata)
        return json.dumps(basicMetadata, separators=(",", ":"))


[docs]@inherit_doc
class DefaultParamsReadable(MLReadable[RL]):
    """
    Helper trait for making simple :py:class:`Params` types readable.
    If a :py:class:`Params` class stores all data as :py:class:`Param` values,
    then extending this trait will provide a default implementation of reading saved
    instances of the class. This only handles simple :py:class:`Param` types;
    e.g., it will not handle :py:class:`pyspark.sql.DataFrame`. See
    :py:class:`DefaultParamsWritable`, the counterpart to this class.

    .. versionadded:: 2.3.0
    """

[docs]    @classmethod
    def read(cls) -> "DefaultParamsReader[RL]":
        """Returns a DefaultParamsReader instance for this class."""
        return DefaultParamsReader(cls)


[docs]@inherit_doc
class DefaultParamsReader(MLReader[RL]):
    """
    Specialization of :py:class:`MLReader` for :py:class:`Params` types

    Default :py:class:`MLReader` implementation for transformers and estimators that
    contain basic (json-serializable) params and no data. This will not handle
    more complex params or types with data (e.g., models with coefficients).

    .. versionadded:: 2.3.0
    """

    def __init__(self, cls: Type[DefaultParamsReadable[RL]]):
        super(DefaultParamsReader, self).__init__()
        self.cls = cls

    @staticmethod
    def __get_class(clazz: str) -> Type[RL]:
        """
        Loads Python class from its name.
        """
        parts = clazz.split(".")
        module = ".".join(parts[:-1])
        m = __import__(module, fromlist=[parts[-1]])
        return getattr(m, parts[-1])

[docs]    def load(self, path: str) -> RL:
        metadata = DefaultParamsReader.loadMetadata(path, self.sc)
        py_type: Type[RL] = DefaultParamsReader.__get_class(metadata["class"])
        instance = py_type()
        cast("Params", instance)._resetUid(metadata["uid"])
        DefaultParamsReader.getAndSetParams(instance, metadata)
        return instance

[docs]    @staticmethod
    def loadMetadata(path: str, sc: "SparkContext", expectedClassName: str = "") -> Dict[str, Any]:
        """
        Load metadata saved using :py:meth:`DefaultParamsWriter.saveMetadata`

        Parameters
        ----------
        path : str
        sc : :py:class:`pyspark.SparkContext`
        expectedClassName : str, optional
            If non empty, this is checked against the loaded metadata.
        """
        metadataPath = os.path.join(path, "metadata")
        metadataStr = sc.textFile(metadataPath, 1).first()
        loadedVals = DefaultParamsReader._parseMetaData(metadataStr, expectedClassName)
        return loadedVals

    @staticmethod
    def _parseMetaData(metadataStr: str, expectedClassName: str = "") -> Dict[str, Any]:
        """
        Parse metadata JSON string produced by :py:meth`DefaultParamsWriter._get_metadata_to_save`.
        This is a helper function for :py:meth:`DefaultParamsReader.loadMetadata`.

        Parameters
        ----------
        metadataStr : str
            JSON string of metadata
        expectedClassName : str, optional
            If non empty, this is checked against the loaded metadata.
        """
        metadata = json.loads(metadataStr)
        className = metadata["class"]
        if len(expectedClassName) > 0:
            assert className == expectedClassName, (
                "Error loading metadata: Expected "
                + "class name {} but found class name {}".format(expectedClassName, className)
            )
        return metadata

[docs]    @staticmethod
    def getAndSetParams(
        instance: RL, metadata: Dict[str, Any], skipParams: Optional[List[str]] = None
    ) -> None:
        """
        Extract Params from metadata, and set them in the instance.
        """
        # Set user-supplied param values
        for paramName in metadata["paramMap"]:
            param = cast("Params", instance).getParam(paramName)
            if skipParams is None or paramName not in skipParams:
                paramValue = metadata["paramMap"][paramName]
                cast("Params", instance).set(param, paramValue)

        # Set default param values
        majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata["sparkVersion"])
        major = majorAndMinorVersions[0]
        minor = majorAndMinorVersions[1]

        # For metadata file prior to Spark 2.4, there is no default section.
        if major > 2 or (major == 2 and minor >= 4):
            assert "defaultParamMap" in metadata, (
                "Error loading metadata: Expected " + "`defaultParamMap` section not found"
            )

            for paramName in metadata["defaultParamMap"]:
                paramValue = metadata["defaultParamMap"][paramName]
                cast("Params", instance)._setDefault(**{paramName: paramValue})

[docs]    @staticmethod
    def isPythonParamsInstance(metadata: Dict[str, Any]) -> bool:
        return metadata["class"].startswith("pyspark.ml.")

[docs]    @staticmethod
    def loadParamsInstance(path: str, sc: "SparkContext") -> RL:
        """
        Load a :py:class:`Params` instance from the given path, and return it.
        This assumes the instance inherits from :py:class:`MLReadable`.
        """
        metadata = DefaultParamsReader.loadMetadata(path, sc)
        if DefaultParamsReader.isPythonParamsInstance(metadata):
            pythonClassName = metadata["class"]
        else:
            pythonClassName = metadata["class"].replace("org.apache.spark", "pyspark")
        py_type: Type[RL] = DefaultParamsReader.__get_class(pythonClassName)
        instance = py_type.load(path)
        return instance


[docs]@inherit_doc
class HasTrainingSummary(Generic[T]):
    """
    Base class for models that provides Training summary.

    .. versionadded:: 3.0.0
    """

    @property
    @since("2.1.0")
    def hasSummary(self) -> bool:
        """
        Indicates whether a training summary exists for this model
        instance.
        """
        return cast("JavaWrapper", self)._call_java("hasSummary")

    @property
    @since("2.1.0")
    def summary(self) -> T:
        """
        Gets summary of the model trained on the training set. An exception is thrown if
        no summary exists.
        """
        return cast("JavaWrapper", self)._call_java("summary")


class MetaAlgorithmReadWrite:
    @staticmethod
    def isMetaEstimator(pyInstance: Any) -> bool:
        from pyspark.ml import Estimator, Pipeline
        from pyspark.ml.tuning import _ValidatorParams
        from pyspark.ml.classification import OneVsRest

        return (
            isinstance(pyInstance, Pipeline)
            or isinstance(pyInstance, OneVsRest)
            or (isinstance(pyInstance, Estimator) and isinstance(pyInstance, _ValidatorParams))
        )

    @staticmethod
    def getAllNestedStages(pyInstance: Any) -> List["Params"]:
        from pyspark.ml import Pipeline, PipelineModel
        from pyspark.ml.tuning import _ValidatorParams
        from pyspark.ml.classification import OneVsRest, OneVsRestModel

        # TODO: We need to handle `RFormulaModel.pipelineModel` here after Pyspark RFormulaModel
        #  support pipelineModel property.
        pySubStages: Sequence["Params"]

        if isinstance(pyInstance, Pipeline):
            pySubStages = pyInstance.getStages()
        elif isinstance(pyInstance, PipelineModel):
            pySubStages = cast(List["PipelineStage"], pyInstance.stages)
        elif isinstance(pyInstance, _ValidatorParams):
            raise ValueError("PySpark does not support nested validator.")
        elif isinstance(pyInstance, OneVsRest):
            pySubStages = [pyInstance.getClassifier()]
        elif isinstance(pyInstance, OneVsRestModel):
            pySubStages = [pyInstance.getClassifier()] + pyInstance.models  # type: ignore[operator]
        else:
            pySubStages = []

        nestedStages = []
        for pySubStage in pySubStages:
            nestedStages.extend(MetaAlgorithmReadWrite.getAllNestedStages(pySubStage))

        return [pyInstance] + nestedStages

    @staticmethod
    def getUidMap(instance: Any) -> Dict[str, "Params"]:
        nestedStages = MetaAlgorithmReadWrite.getAllNestedStages(instance)
        uidMap = {stage.uid: stage for stage in nestedStages}
        if len(nestedStages) != len(uidMap):
            raise RuntimeError(
                f"{instance.__class__.__module__}.{instance.__class__.__name__}"
                f".load found a compound estimator with stages with duplicate "
                f"UIDs. List of UIDs: {list(uidMap.keys())}."
            )
        return uidMap


def try_remote_functions(f: FuncT) -> FuncT:
    """Mark API supported from Spark Connect."""

    @functools.wraps(f)
    def wrapped(*args: Any, **kwargs: Any) -> Any:
        if is_remote() and "PYSPARK_NO_NAMESPACE_SHARE" not in os.environ:
            from pyspark.ml.connect import functions

            return getattr(functions, f.__name__)(*args, **kwargs)
        else:
            return f(*args, **kwargs)

    return cast(FuncT, wrapped)