diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ccdc098900..f0c14d5c2b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -10,7 +10,7 @@ # The AI Platform GAPIC libraries are owned by Cloud AI DPE /google/cloud/aiplatform* @googleapis/cdpe-cloudai -# The AI Platform SDK is owned by Model Builder SDK Dev team +# The Vertex SDK is owned by Model Builder SDK Dev team /google/cloud/aiplatform/* @googleapis/cloud-aiplatform-model-builder-sdk /tests/unit/aiplatform/* @googleapis/cloud-aiplatform-model-builder-sdk diff --git a/.repo-metadata.json b/.repo-metadata.json index 8a9aaf6bcc..46b1493222 100644 --- a/.repo-metadata.json +++ b/.repo-metadata.json @@ -4,7 +4,7 @@ "product_documentation": "https://cloud.google.com/ai-platform", "client_documentation": "https://googleapis.dev/python/aiplatform/latest", "issue_tracker": "https://issuetracker.google.com/savedsearches/559744", - "release_level": "beta", + "release_level": "ga", "language": "python", "library_type": "GAPIC_COMBO", "repo": "googleapis/python-aiplatform", diff --git a/CHANGELOG.md b/CHANGELOG.md index 02b44159f8..fb6c5f32b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Changelog +## [1.0.0](https://www.github.com/googleapis/python-aiplatform/compare/v0.9.0...v1.0.0) (2021-05-19) + + +### Features + +* add custom and hp tuning ([#388](https://www.github.com/googleapis/python-aiplatform/issues/388)) ([aab9e58](https://www.github.com/googleapis/python-aiplatform/commit/aab9e589426331bfe7ac3f6efa97109e0bd0db0d)) +* add tensorboard support to custom job and hyperparameter tuning job ([#404](https://www.github.com/googleapis/python-aiplatform/issues/404)) ([fa9bc39](https://www.github.com/googleapis/python-aiplatform/commit/fa9bc3943df55bc0d077ba9b02101ae792a6fb57)) + + +### Bug Fixes + +* tb-gcp-uploader to show flags in "--help" correctly ([#409](https://www.github.com/googleapis/python-aiplatform/issues/409)) ([9f603dd](https://www.github.com/googleapis/python-aiplatform/commit/9f603dd57868d893cb3be6cf70686fdce2706a6c)) + + +### Miscellaneous Chores + +* release 1.0.0 ([#407](https://www.github.com/googleapis/python-aiplatform/issues/407)) ([a2d7b68](https://www.github.com/googleapis/python-aiplatform/commit/a2d7b68e4016965f6e3771053f77e1745b44c403)) + ## [0.9.0](https://www.github.com/googleapis/python-aiplatform/compare/v0.8.0...v0.9.0) (2021-05-17) diff --git a/README.rst b/README.rst index 209b577ead..57ead60fea 100644 --- a/README.rst +++ b/README.rst @@ -1,23 +1,23 @@ -Python Client for Cloud AI Platform +Vertex SDK for Python ================================================= -|beta| |pypi| |versions| +|GA| |pypi| |versions| -`Cloud AI Platform`_: Google Cloud AI Platform is an integrated suite of machine learning tools and services for building and using ML models with AutoML or custom code. It offers both novices and experts the best workbench for the entire machine learning development lifecycle. +`Vertex AI`_: Google Vertex AI is an integrated suite of machine learning tools and services for building and using ML models with AutoML or custom code. It offers both novices and experts the best workbench for the entire machine learning development lifecycle. - `Client Library Documentation`_ - `Product Documentation`_ -.. |beta| image:: https://img.shields.io/badge/support-beta-orange.svg - :target: https://github.com/googleapis/google-cloud-python/blob/master/README.rst#beta-support +.. 
|GA| image:: https://img.shields.io/badge/support-ga-gold.svg
+   :target: https://github.com/googleapis/google-cloud-python/blob/master/README.rst#general-availability
 .. |pypi| image:: https://img.shields.io/pypi/v/google-cloud-aiplatform.svg
    :target: https://pypi.org/project/google-cloud-aiplatform/
 .. |versions| image:: https://img.shields.io/pypi/pyversions/google-cloud-aiplatform.svg
    :target: https://pypi.org/project/google-cloud-aiplatform/
-.. _Cloud AI Platform: https://cloud.google.com/ai-platform-unified/docs
+.. _Vertex AI: https://cloud.google.com/vertex-ai/docs
 .. _Client Library Documentation: https://googleapis.dev/python/aiplatform/latest
-.. _Product Documentation: https://cloud.google.com/ai-platform-unified/docs
+.. _Product Documentation: https://cloud.google.com/vertex-ai/docs

 Quick Start
 -----------

@@ -26,12 +26,12 @@ In order to use this library, you first need to go through the following steps:

 1. `Select or create a Cloud Platform project.`_
 2. `Enable billing for your project.`_
-3. `Enable the Cloud AI Platform API.`_
+3. `Enable the Vertex AI API.`_
 4. `Setup Authentication.`_

 .. _Select or create a Cloud Platform project.: https://console.cloud.google.com/project
 .. _Enable billing for your project.: https://cloud.google.com/billing/docs/how-to/modify-project#enable_billing_for_a_project
-.. _Enable the Cloud AI Platform API.: https://cloud.google.com/ai-platform/docs
+.. _Enable the Vertex AI API.: https://cloud.google.com/ai-platform/docs
 .. _Setup Authentication.: https://googleapis.dev/python/google-api-core/latest/auth.html

 Installation
 ~~~~~~~~~~~~

@@ -45,7 +45,7 @@ With `virtualenv`_, it's possible to install this library without needing
 system install permissions, and without clashing with the installed system
 dependencies.

-.. _`virtualenv`: https://virtualenv.pypa.io/en/latest/
+.. _virtualenv: https://virtualenv.pypa.io/en/latest/


 Mac/Linux
 ^^^^^^^^^

     pip install virtualenv
     virtualenv <your-env>
     source <your-env>/bin/activate
     <your-env>/bin/pip install google-cloud-aiplatform

@@ -69,15 +69,262 @@ Windows
 ^^^^^^^

     <your-env>\Scripts\activate
     <your-env>\Scripts\pip.exe install google-cloud-aiplatform
+
+Overview
+~~~~~~~~
+This section provides a brief overview of the Vertex SDK for Python. You can also reference the notebooks in `vertex-ai-samples`_ for examples.
+
+.. _vertex-ai-samples: https://github.com/GoogleCloudPlatform/ai-platform-samples/tree/master/ai-platform-unified/notebooks/unofficial/sdk
+
+Importing
+^^^^^^^^^
+SDK functionality can be used from the root of the package:
+
+.. code-block:: Python
+
+    from google.cloud import aiplatform
+
+
+Initialization
+^^^^^^^^^^^^^^
+Initialize the SDK to store common configurations that you use with the SDK.
+
+.. code-block:: Python
+
+    aiplatform.init(
+        # your Google Cloud Project ID or number
+        # environment default used if not set
+        project='my-project',
+
+        # the Vertex AI region you will use
+        # defaults to us-central1
+        location='us-central1',
+
+        # Google Cloud Storage bucket in same region as location
+        # used to stage artifacts
+        staging_bucket='gs://my_staging_bucket',
+
+        # custom google.auth.credentials.Credentials
+        # environment default creds used if not set
+        credentials=my_credentials,
+
+        # customer managed encryption key resource name
+        # will be applied to all Vertex AI resources if set
+        encryption_spec_key_name=my_encryption_key_name,
+
+        # the name of the experiment to use to track
+        # logged metrics and parameters
+        experiment='my-experiment',
+
+        # description of the experiment above
+        experiment_description='my experiment description'
+    )
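+
+Once initialized, subsequent SDK calls pick up these defaults. If an experiment is
+set, runs can then be tracked with the SDK's experiment-tracking helpers. The
+snippet below is illustrative only: the run name and values are placeholders, and
+it assumes the ``aiplatform.start_run`` helper is available alongside the
+``log_params`` and ``log_metrics`` helpers exported by the package:
+
+.. code-block:: Python
+
+    # requires `experiment` to be set in aiplatform.init
+    aiplatform.start_run('my-run')
+    aiplatform.log_params({'learning_rate': 0.01})
+    aiplatform.log_metrics({'accuracy': 0.95})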
+
+
+Datasets
+^^^^^^^^
+Vertex AI provides managed tabular, text, image, and video datasets. In the SDK,
+datasets can be used downstream to train models.
+
+To create a tabular dataset:
+
+.. code-block:: Python
+
+    my_dataset = aiplatform.TabularDataset.create(
+        display_name="my-dataset", gcs_source=['gs://path/to/my/dataset.csv'])
+
+You can also create and import a dataset in separate steps:
+
+.. code-block:: Python
+
+    from google.cloud import aiplatform
+
+    my_dataset = aiplatform.TextDataset.create(
+        display_name="my-dataset")
+
+    my_dataset.import_data(
+        gcs_source=['gs://path/to/my/dataset.csv'],
+        import_schema_uri=aiplatform.schema.dataset.ioformat.text.multi_label_classification
+    )
+
+To get a previously created Dataset:
+
+.. code-block:: Python
+
+    dataset = aiplatform.ImageDataset('projects/my-project/locations/us-central1/datasets/{DATASET_ID}')
+
+Vertex AI supports a variety of dataset schemas. References to these schemas are available under the
+:code:`aiplatform.schema.dataset` namespace. For more information on the supported dataset schemas please refer to the
+`Preparing data docs`_.
+
+.. _Preparing data docs: https://cloud.google.com/ai-platform-unified/docs/datasets/prepare
+
+Training
+^^^^^^^^
+The Vertex SDK for Python allows you to train Custom and AutoML models.
+
+You can train custom models using a custom Python script, custom Python package, or container.
+
+**Preparing Your Custom Code**
+
+Vertex AI custom training enables you to train on Vertex AI datasets and produce Vertex AI models. To do so your
+script must adhere to the following contract:
+
+It must read datasets from the environment variables populated by the training service:
+
+.. code-block:: Python
+
+    os.environ['AIP_DATA_FORMAT']  # provides format of data
+    os.environ['AIP_TRAINING_DATA_URI']  # uri to training split
+    os.environ['AIP_VALIDATION_DATA_URI']  # uri to validation split
+    os.environ['AIP_TEST_DATA_URI']  # uri to test split
+
+Please visit `Using a managed dataset in a custom training application`_ for a detailed overview.
+
+.. _Using a managed dataset in a custom training application: https://cloud.google.com/vertex-ai/docs/training/using-managed-datasets
+
+It must write the model artifact to the environment variable populated by the training service:
+
+.. code-block:: Python
+
+    os.environ['AIP_MODEL_DIR']
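+
+A script that follows this contract might look like the sketch below. It is
+purely illustrative: ``train_model`` and ``save_model`` are hypothetical
+placeholders for your own training and serialization code.
+
+.. code-block:: Python
+
+    import os
+
+    data_format = os.environ['AIP_DATA_FORMAT']  # e.g. "csv" or "jsonl"
+    training_data_uri = os.environ['AIP_TRAINING_DATA_URI']
+
+    model = train_model(training_data_uri)  # your training logic
+
+    # write the artifact where Vertex AI expects to find it
+    save_model(model, os.environ['AIP_MODEL_DIR'])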
+
+**Running Training**
+
+.. code-block:: Python
+
+    job = aiplatform.CustomTrainingJob(
+        display_name="my-training-job",
+        script_path="training_script.py",
+        container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
+        requirements=["gcsfs==0.7.1"],
+        model_serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
+    )
+
+    model = job.run(my_dataset,
+                    replica_count=1,
+                    machine_type="n1-standard-4",
+                    accelerator_type='NVIDIA_TESLA_K80',
+                    accelerator_count=1)
+
+In the code block above `my_dataset` is the managed dataset created in the `Datasets` section above. The `model` variable is a managed Vertex AI model that can be deployed or exported.
+
+
+AutoMLs
+-------
+The Vertex SDK for Python supports AutoML tabular, image, text, video, and forecasting.
+
+To train an AutoML tabular model:
+
+.. code-block:: Python
+
+    dataset = aiplatform.TabularDataset('projects/my-project/locations/us-central1/datasets/{DATASET_ID}')
+
+    job = aiplatform.AutoMLTabularTrainingJob(
+        display_name="train-automl",
+        optimization_prediction_type="regression",
+        optimization_objective="minimize-rmse",
+    )
+
+    model = job.run(
+        dataset=dataset,
+        target_column="target_column_name",
+        training_fraction_split=0.6,
+        validation_fraction_split=0.2,
+        test_fraction_split=0.2,
+        budget_milli_node_hours=1000,
+        model_display_name="my-automl-model",
+        disable_early_stopping=False,
+    )
+
+
+Models
+------
+
+To deploy a model:
+
+.. code-block:: Python
+
+    endpoint = model.deploy(machine_type="n1-standard-4",
+                            min_replica_count=1,
+                            max_replica_count=5,
+                            accelerator_type='NVIDIA_TESLA_K80',
+                            accelerator_count=1)
+
+To upload a model:
+
+.. code-block:: Python
+
+    model = aiplatform.Model.upload(
+        display_name='my-model',
+        artifact_uri="gs://python/to/my/model/dir",
+        serving_container_image_uri="gcr.io/cloud-aiplatform/prediction/tf2-cpu.2-2:latest",
+    )
+
+To get a model:
+
+.. code-block:: Python
+
+    model = aiplatform.Model('projects/my-project/locations/us-central1/models/{MODEL_ID}')
+
+Please visit `Importing models to Vertex AI`_ for a detailed overview.
+
+.. _Importing models to Vertex AI: https://cloud.google.com/vertex-ai/docs/general/import-model
+
+
+Endpoints
+---------
+
+To create an endpoint:
+
+.. code-block:: Python
+
+    endpoint = aiplatform.Endpoint.create(display_name='my-endpoint')
+
+To deploy a model to a created endpoint:
+
+.. code-block:: Python
+
+    model = aiplatform.Model('projects/my-project/locations/us-central1/models/{MODEL_ID}')
+
+    endpoint.deploy(model,
+                    min_replica_count=1,
+                    max_replica_count=5,
+                    machine_type='n1-standard-4',
+                    accelerator_type='NVIDIA_TESLA_K80',
+                    accelerator_count=1)
+
+To get predictions from an endpoint:
+
+.. code-block:: Python
+
+    endpoint.predict(instances=[[6.7, 3.1, 4.7, 1.5], [4.6, 3.1, 1.5, 0.2]])
+
+To undeploy models from an endpoint:
+
+.. code-block:: Python
+
+    endpoint.undeploy_all()
+
+To delete an endpoint:
+
+.. code-block:: Python
+
+    endpoint.delete()
+
+
 Next Steps
 ~~~~~~~~~~

-- Read the `Client Library Documentation`_ for Cloud AI Platform
+- Read the `Client Library Documentation`_ for Vertex AI
   API to see other available methods on the client.
-- Read the `Cloud AI Platform API Product documentation`_ to learn
+- Read the `Vertex AI API Product documentation`_ to learn
   more about the product and see How-to Guides.
- View this `README`_ to see the full list of Cloud APIs that we cover.

-.. _Cloud AI Platform API Product documentation: https://cloud.google.com/ai-platform-unified/docs
+.. _Vertex AI API Product documentation: https://cloud.google.com/vertex-ai/docs
 .. _README: https://github.com/googleapis/google-cloud-python/blob/master/README.rst
\ No newline at end of file
diff --git a/google/cloud/aiplatform/__init__.py b/google/cloud/aiplatform/__init__.py
index e56e57a2ad..6aa8f64161 100644
--- a/google/cloud/aiplatform/__init__.py
+++ b/google/cloud/aiplatform/__init__.py
@@ -26,9 +26,15 @@
     TimeSeriesDataset,
     VideoDataset,
 )
+from google.cloud.aiplatform import hyperparameter_tuning
+from google.cloud.aiplatform.metadata import metadata
 from google.cloud.aiplatform.models import Endpoint
 from google.cloud.aiplatform.models import Model
-from google.cloud.aiplatform.jobs import BatchPredictionJob
+from google.cloud.aiplatform.jobs import (
+    BatchPredictionJob,
+    CustomJob,
+    HyperparameterTuningJob,
+)
 from google.cloud.aiplatform.training_jobs import (
     CustomTrainingJob,
     CustomContainerTrainingJob,
@@ -39,7 +45,6 @@
     AutoMLTextTrainingJob,
     AutoMLVideoTrainingJob,
 )
-from google.cloud.aiplatform.metadata import metadata

 """
 Usage:
@@ -60,6 +65,7 @@
     "explain",
     "gapic",
     "init",
+    "hyperparameter_tuning",
     "log_params",
     "log_metrics",
     "get_experiment_df",
@@ -71,11 +77,13 @@
     "AutoMLTextTrainingJob",
     "AutoMLVideoTrainingJob",
     "BatchPredictionJob",
+    "CustomJob",
     "CustomTrainingJob",
     "CustomContainerTrainingJob",
     "CustomPythonPackageTrainingJob",
     "Endpoint",
     "ImageDataset",
+    "HyperparameterTuningJob",
     "Model",
     "TabularDataset",
     "TextDataset",
diff --git a/google/cloud/aiplatform/base.py b/google/cloud/aiplatform/base.py
index f46db9c47e..f183c1aedc 100644
--- a/google/cloud/aiplatform/base.py
+++ b/google/cloud/aiplatform/base.py
@@ -60,14 +60,14 @@ def __init__(self, name: str = ""):

     def log_create_with_lro(
         self,
-        cls: Type["AiPlatformResourceNoun"],
+        cls: Type["VertexAiResourceNoun"],
         lro: Optional[operation.Operation] = None,
     ):
         """Logs create event with LRO.

         Args:
-            cls (AiPlatformResourceNoune):
-                AI Platform Resource Noun class that is being created.
+            cls (VertexAiResourceNoun):
+                Vertex AI Resource Noun class that is being created.
             lro (operation.Operation): Optional. Backing LRO for creation.
         """
@@ -80,7 +80,7 @@ def log_create_with_lro(

     def log_create_complete(
         self,
-        cls: Type["AiPlatformResourceNoun"],
+        cls: Type["VertexAiResourceNoun"],
         resource: proto.Message,
         variable_name: str,
     ):
@@ -89,10 +89,10 @@ def log_create_complete(
         Will also include code snippet to instantiate resource in SDK.

         Args:
-            cls (AiPlatformResourceNoun):
-                AI Platform Resource Noun class that is being created.
+            cls (VertexAiResourceNoun):
+                Vertex AI Resource Noun class that is being created.
             resource (proto.Message):
-                AI Platform Resourc proto.Message
+                Vertex AI Resource proto.Message
             variable_name (str): Name of variable to use for code snippet
         """
         self._logger.info(f"{cls.__name__} created. Resource name: {resource.name}")
@@ -101,15 +101,38 @@
             f"{variable_name} = aiplatform.{cls.__name__}('{resource.name}')"
         )

+    def log_create_complete_with_getter(
+        self,
+        cls: Type["VertexAiResourceNoun"],
+        resource: proto.Message,
+        variable_name: str,
+    ):
+        """Logs create event is complete.
+
+        Will also include code snippet to instantiate resource in SDK.
+
+        Args:
+            cls (VertexAiResourceNoun):
+                Vertex AI Resource Noun class that is being created.
+            resource (proto.Message):
+                Vertex AI Resource proto.Message
+            variable_name (str): Name of variable to use for code snippet
+        """
+        self._logger.info(f"{cls.__name__} created. Resource name: {resource.name}")
+        self._logger.info(f"To use this {cls.__name__} in another session:")
+        self._logger.info(
+            f"{variable_name} = aiplatform.{cls.__name__}.get('{resource.name}')"
+        )
+
     def log_action_start_against_resource(
-        self, action: str, noun: str, resource_noun_obj: "AiPlatformResourceNoun"
+        self, action: str, noun: str, resource_noun_obj: "VertexAiResourceNoun"
     ):
         """Logs intention to start an action against a resource.

         Args:
             action (str): Action to complete against the resource ie: "Deploying". Can be empty string.
             noun (str): Noun the action acts on against the resource. Can be empty string.
-            resource_noun_obj (AiPlatformResourceNoun):
+            resource_noun_obj (VertexAiResourceNoun):
                 Resource noun object the action is acting against.
         """
         self._logger.info(
@@ -120,7 +143,7 @@ def log_action_started_against_resource_with_lro(
         self,
         action: str,
         noun: str,
-        cls: Type["AiPlatformResourceNoun"],
+        cls: Type["VertexAiResourceNoun"],
         lro: operation.Operation,
     ):
         """Logs an action started against a resource with lro.
@@ -128,7 +151,7 @@
         Args:
             action (str): Action started against resource. ie: "Deploy". Can be empty string.
             noun (str): Noun the action acts on against the resource. Can be empty string.
-            cls (AiPlatformResourceNoun):
+            cls (VertexAiResourceNoun):
                 Resource noun object the action is acting against.
             lro (operation.Operation): Backing LRO for action.
         """
@@ -137,14 +160,14 @@
         )

     def log_action_completed_against_resource(
-        self, noun: str, action: str, resource_noun_obj: "AiPlatformResourceNoun"
+        self, noun: str, action: str, resource_noun_obj: "VertexAiResourceNoun"
     ):
         """Logs action completed against resource.

         Args:
             noun (str): Noun the action acts on against the resource. Can be empty string.
             action (str): Action started against resource. ie: "Deployed". Can be empty string.
-            resource_noun_obj (AiPlatformResourceNoun):
+            resource_noun_obj (VertexAiResourceNoun):
                 Resource noun object the action is acting against
         """
         self._logger.info(
@@ -362,8 +385,8 @@ def __repr__(self) -> str:
         return object.__repr__(self)


-class AiPlatformResourceNoun(metaclass=abc.ABCMeta):
-    """Base class the AI Platform resource nouns.
+class VertexAiResourceNoun(metaclass=abc.ABCMeta):
+    """Base class for the Vertex AI resource nouns.

     Subclasses require two class attributes:

@@ -377,7 +400,7 @@
     @property
     @classmethod
     @abc.abstractmethod
-    def client_class(cls) -> Type[utils.AiPlatformServiceClientWithOverride]:
+    def client_class(cls) -> Type[utils.VertexAiServiceClientWithOverride]:
         """Client class required to interact with resource with optional overrides."""
         pass

@@ -441,7 +464,7 @@ def _instantiate_client(
         cls,
         location: Optional[str] = None,
         credentials: Optional[auth_credentials.Credentials] = None,
-    ) -> utils.AiPlatformServiceClientWithOverride:
+    ) -> utils.VertexAiServiceClientWithOverride:
         """Helper method to instantiate service client for resource noun.

         Args:
             location (str): The location of the resource noun.
             credentials (google.auth.credentials.Credentials):
                 Optional custom credentials to use when accessing interacting with resource noun.
         Returns:
-            client (utils.AiPlatformServiceClientWithOverride):
+            client (utils.VertexAiServiceClientWithOverride):
                 Initialized service client for this service noun with optional overrides.
""" return initializer.global_config.create_client( @@ -543,6 +566,11 @@ def update_time(self) -> datetime.datetime: self._sync_gca_resource() return self._gca_resource.update_time + @property + def gca_resource(self) -> proto.Message: + """The underlying resource proto represenation.""" + return self._gca_resource + def __repr__(self) -> str: return f"{object.__repr__(self)} \nresource name: {self.resource_name}" @@ -552,7 +580,7 @@ def optional_sync( return_input_arg: Optional[str] = None, bind_future_to_self: bool = True, ): - """Decorator for AiPlatformResourceNounWithFutureManager with optional sync + """Decorator for VertexAiResourceNounWithFutureManager with optional sync support. Methods with this decorator should include a "sync" argument that defaults to @@ -686,8 +714,8 @@ def wrapper(*args, **kwargs): return optional_run_in_thread -class AiPlatformResourceNounWithFutureManager(AiPlatformResourceNoun, FutureManager): - """Allows optional asynchronous calls to this AI Platform Resource +class VertexAiResourceNounWithFutureManager(VertexAiResourceNoun, FutureManager): + """Allows optional asynchronous calls to this Vertex AI Resource Nouns.""" def __init__( @@ -707,7 +735,7 @@ def __init__( resource noun. resource_name(str): A fully-qualified resource name or ID. """ - AiPlatformResourceNoun.__init__( + VertexAiResourceNoun.__init__( self, project=project, location=location, @@ -723,7 +751,7 @@ def _empty_constructor( location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, resource_name: Optional[str] = None, - ) -> "AiPlatformResourceNounWithFutureManager": + ) -> "VertexAiResourceNounWithFutureManager": """Initializes with all attributes set to None. The attributes should be populated after a future is complete. This allows @@ -740,7 +768,7 @@ def _empty_constructor( An instance of this class with attributes set to None. """ self = cls.__new__(cls) - AiPlatformResourceNoun.__init__( + VertexAiResourceNoun.__init__( self, project=project, location=location, @@ -752,12 +780,12 @@ def _empty_constructor( return self def _sync_object_with_future_result( - self, result: "AiPlatformResourceNounWithFutureManager" + self, result: "VertexAiResourceNounWithFutureManager" ): """Populates attributes from a Future result to this object. Args: - result: AiPlatformResourceNounWithFutureManager + result: VertexAiResourceNounWithFutureManager Required. Result of future with same type as this object. """ sync_attributes = [ @@ -783,12 +811,12 @@ def _construct_sdk_resource_from_gapic( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> AiPlatformResourceNoun: + ) -> VertexAiResourceNoun: """Given a GAPIC resource object, return the SDK representation. Args: gapic_resource (proto.Message): - A GAPIC representation of an AI Platform resource, usually + A GAPIC representation of an Vertex AI resource, usually retrieved by a get_* or in a list_* API call. project (str): Optional. Project to construct SDK object from. If not set, @@ -801,7 +829,7 @@ def _construct_sdk_resource_from_gapic( Overrides credentials set in aiplatform.init. Returns: - AiPlatformResourceNoun: + VertexAiResourceNoun: An initialized SDK object that represents GAPIC type. 
""" sdk_resource = self._empty_constructor( @@ -821,8 +849,8 @@ def _list( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> List[AiPlatformResourceNoun]: - """Private method to list all instances of this AI Platform Resource, + ) -> List[VertexAiResourceNoun]: + """Private method to list all instances of this Vertex AI Resource, takes a `cls_filter` arg to filter to a particular SDK resource subclass. @@ -850,7 +878,7 @@ def _list( credentials set in aiplatform.init. Returns: - List[AiPlatformResourceNoun] - A list of SDK resource objects + List[VertexAiResourceNoun] - A list of SDK resource objects """ self = cls._empty_constructor( project=project, location=location, credentials=credentials @@ -890,8 +918,8 @@ def _list_with_local_order( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> List[AiPlatformResourceNoun]: - """Private method to list all instances of this AI Platform Resource, + ) -> List[VertexAiResourceNoun]: + """Private method to list all instances of this Vertex AI Resource, takes a `cls_filter` arg to filter to a particular SDK resource subclass. Provides client-side sorting when a list API doesn't support `order_by`. @@ -920,7 +948,7 @@ def _list_with_local_order( credentials set in aiplatform.init. Returns: - List[AiPlatformResourceNoun] - A list of SDK resource objects + List[VertexAiResourceNoun] - A list of SDK resource objects """ li = cls._list( @@ -952,8 +980,8 @@ def list( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> List[AiPlatformResourceNoun]: - """List all instances of this AI Platform Resource. + ) -> List[VertexAiResourceNoun]: + """List all instances of this Vertex AI Resource. Example Usage: @@ -982,7 +1010,7 @@ def list( credentials set in aiplatform.init. Returns: - List[AiPlatformResourceNoun] - A list of SDK resource objects + List[VertexAiResourceNoun] - A list of SDK resource objects """ return cls._list( @@ -995,7 +1023,7 @@ def list( @optional_sync() def delete(self, sync: bool = True) -> None: - """Deletes this AI Platform resource. WARNING: This deletion is + """Deletes this Vertex AI resource. WARNING: This deletion is permament. 
Args: @@ -1014,7 +1042,7 @@ def delete(self, sync: bool = True) -> None: def __repr__(self) -> str: if self._gca_resource: - return AiPlatformResourceNoun.__repr__(self) + return VertexAiResourceNoun.__repr__(self) return FutureManager.__repr__(self) diff --git a/google/cloud/aiplatform/compat/__init__.py b/google/cloud/aiplatform/compat/__init__.py index 980c554fe1..55a72fea16 100644 --- a/google/cloud/aiplatform/compat/__init__.py +++ b/google/cloud/aiplatform/compat/__init__.py @@ -70,6 +70,7 @@ types.prediction_service = types.prediction_service_v1beta1 types.specialist_pool = types.specialist_pool_v1beta1 types.specialist_pool_service = types.specialist_pool_service_v1beta1 + types.study = types.study_v1beta1 types.training_pipeline = types.training_pipeline_v1beta1 types.metadata_service = types.metadata_service_v1beta1 types.tensorboard_service = types.tensorboard_service_v1beta1 @@ -120,6 +121,7 @@ types.prediction_service = types.prediction_service_v1 types.specialist_pool = types.specialist_pool_v1 types.specialist_pool_service = types.specialist_pool_service_v1 + types.study = types.study_v1 types.training_pipeline = types.training_pipeline_v1 __all__ = ( diff --git a/google/cloud/aiplatform/compat/types/__init__.py b/google/cloud/aiplatform/compat/types/__init__.py index f45bb2e11e..7bd512e7e8 100644 --- a/google/cloud/aiplatform/compat/types/__init__.py +++ b/google/cloud/aiplatform/compat/types/__init__.py @@ -49,6 +49,7 @@ prediction_service as prediction_service_v1beta1, specialist_pool as specialist_pool_v1beta1, specialist_pool_service as specialist_pool_service_v1beta1, + study as study_v1beta1, training_pipeline as training_pipeline_v1beta1, metadata_service as metadata_service_v1beta1, tensorboard_service as tensorboard_service_v1beta1, @@ -90,6 +91,7 @@ prediction_service as prediction_service_v1, specialist_pool as specialist_pool_v1, specialist_pool_service as specialist_pool_service_v1, + study as study_v1, training_pipeline as training_pipeline_v1, ) diff --git a/google/cloud/aiplatform/datasets/_datasources.py b/google/cloud/aiplatform/datasets/_datasources.py index ea436eb91b..9323f40382 100644 --- a/google/cloud/aiplatform/datasets/_datasources.py +++ b/google/cloud/aiplatform/datasets/_datasources.py @@ -46,7 +46,7 @@ def import_data_config(self): class TabularDatasource(Datasource): - """Datasource for creating a tabular dataset for AI Platform.""" + """Datasource for creating a tabular dataset for Vertex AI.""" def __init__( self, @@ -99,7 +99,7 @@ def dataset_metadata(self) -> Optional[Dict]: class NonTabularDatasource(Datasource): - """Datasource for creating an empty non-tabular dataset for AI Platform.""" + """Datasource for creating an empty non-tabular dataset for Vertex AI.""" @property def dataset_metadata(self) -> Optional[Dict]: @@ -107,7 +107,7 @@ def dataset_metadata(self) -> Optional[Dict]: class NonTabularDatasourceImportable(NonTabularDatasource, DatasourceImportable): - """Datasource for creating a non-tabular dataset for AI Platform and + """Datasource for creating a non-tabular dataset for Vertex AI and importing data to the dataset.""" def __init__( diff --git a/google/cloud/aiplatform/datasets/dataset.py b/google/cloud/aiplatform/datasets/dataset.py index 44dadc4ee4..1eb1663b2b 100644 --- a/google/cloud/aiplatform/datasets/dataset.py +++ b/google/cloud/aiplatform/datasets/dataset.py @@ -35,8 +35,8 @@ _LOGGER = base.Logger(__name__) -class _Dataset(base.AiPlatformResourceNounWithFutureManager): - """Managed dataset resource for AI 
Platform.""" +class _Dataset(base.VertexAiResourceNounWithFutureManager): + """Managed dataset resource for Vertex AI.""" client_class = utils.DatasetClientWithOverride _is_client_prediction_client = False @@ -264,7 +264,7 @@ def _create_and_import( that can be used here are found in gs://google-cloud- aiplatform/schema/dataset/metadata/. datasource (_datasources.Datasource): - Required. Datasource for creating a dataset for AI Platform. + Required. Datasource for creating a dataset for Vertex AI. project (str): Required. Project to upload this model to. Overrides project set in aiplatform.init. @@ -368,7 +368,7 @@ def _create( that can be used here are found in gs://google-cloud- aiplatform/schema/dataset/metadata/. datasource (_datasources.Datasource): - Required. Datasource for creating a dataset for AI Platform. + Required. Datasource for creating a dataset for Vertex AI. request_metadata (Sequence[Tuple[str, str]]): Strings which should be sent along with the create_dataset request as metadata. Usually to specify special dataset config. @@ -401,7 +401,7 @@ def _import( Args: datasource (_datasources.DatasourceImportable): - Required. Datasource for importing data to an existing dataset for AI Platform. + Required. Datasource for importing data to an existing dataset for Vertex AI. Returns: operation (Operation): @@ -528,7 +528,7 @@ def list( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> List[base.AiPlatformResourceNoun]: + ) -> List[base.VertexAiResourceNoun]: """List all instances of this Dataset resource. Example Usage: @@ -557,7 +557,7 @@ def list( credentials set in aiplatform.init. Returns: - List[base.AiPlatformResourceNoun] - A list of Dataset resource objects + List[base.VertexAiResourceNoun] - A list of Dataset resource objects """ dataset_subclass_filter = ( diff --git a/google/cloud/aiplatform/datasets/image_dataset.py b/google/cloud/aiplatform/datasets/image_dataset.py index c2b3ca68b5..506338c915 100644 --- a/google/cloud/aiplatform/datasets/image_dataset.py +++ b/google/cloud/aiplatform/datasets/image_dataset.py @@ -27,7 +27,7 @@ class ImageDataset(datasets._Dataset): - """Managed image dataset resource for AI Platform.""" + """Managed image dataset resource for Vertex AI.""" _supported_metadata_schema_uris: Optional[Tuple[str]] = ( schema.dataset.metadata.image, diff --git a/google/cloud/aiplatform/datasets/tabular_dataset.py b/google/cloud/aiplatform/datasets/tabular_dataset.py index b80266cf00..95f1b16f98 100644 --- a/google/cloud/aiplatform/datasets/tabular_dataset.py +++ b/google/cloud/aiplatform/datasets/tabular_dataset.py @@ -33,7 +33,7 @@ class TabularDataset(datasets._Dataset): - """Managed tabular dataset resource for AI Platform.""" + """Managed tabular dataset resource for Vertex AI.""" _supported_metadata_schema_uris: Optional[Tuple[str]] = ( schema.dataset.metadata.tabular, diff --git a/google/cloud/aiplatform/datasets/text_dataset.py b/google/cloud/aiplatform/datasets/text_dataset.py index 6f6fd57bda..85676ed2ed 100644 --- a/google/cloud/aiplatform/datasets/text_dataset.py +++ b/google/cloud/aiplatform/datasets/text_dataset.py @@ -27,7 +27,7 @@ class TextDataset(datasets._Dataset): - """Managed text dataset resource for AI Platform.""" + """Managed text dataset resource for Vertex AI.""" _supported_metadata_schema_uris: Optional[Tuple[str]] = ( schema.dataset.metadata.text, diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py 
b/google/cloud/aiplatform/datasets/time_series_dataset.py index 92d8e60c37..d5aa3dcbf2 100644 --- a/google/cloud/aiplatform/datasets/time_series_dataset.py +++ b/google/cloud/aiplatform/datasets/time_series_dataset.py @@ -27,7 +27,7 @@ class TimeSeriesDataset(datasets._Dataset): - """Managed time series dataset resource for AI Platform""" + """Managed time series dataset resource for Vertex AI""" _supported_metadata_schema_uris: Optional[Tuple[str]] = ( schema.dataset.metadata.time_series, diff --git a/google/cloud/aiplatform/datasets/video_dataset.py b/google/cloud/aiplatform/datasets/video_dataset.py index 7064c8b7cf..594a4ac407 100644 --- a/google/cloud/aiplatform/datasets/video_dataset.py +++ b/google/cloud/aiplatform/datasets/video_dataset.py @@ -27,7 +27,7 @@ class VideoDataset(datasets._Dataset): - """Managed video dataset resource for AI Platform.""" + """Managed video dataset resource for Vertex AI.""" _supported_metadata_schema_uris: Optional[Tuple[str]] = ( schema.dataset.metadata.video, diff --git a/google/cloud/aiplatform/hyperparameter_tuning.py b/google/cloud/aiplatform/hyperparameter_tuning.py new file mode 100644 index 0000000000..a7a0e641cd --- /dev/null +++ b/google/cloud/aiplatform/hyperparameter_tuning.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+import abc
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import proto
+
+from google.cloud.aiplatform.compat.types import study as gca_study_compat
+
+_SCALE_TYPE_MAP = {
+    "linear": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE,
+    "log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE,
+    "reverse_log": gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_REVERSE_LOG_SCALE,
+    "unspecified": gca_study_compat.StudySpec.ParameterSpec.ScaleType.SCALE_TYPE_UNSPECIFIED,
+}
+
+
+class _ParameterSpec(metaclass=abc.ABCMeta):
+    """Base class that represents a single parameter to optimize."""
+
+    def __init__(
+        self,
+        conditional_parameter_spec: Optional[Dict[str, "_ParameterSpec"]] = None,
+        parent_values: Optional[List[Union[float, int, str]]] = None,
+    ):
+
+        self.conditional_parameter_spec = conditional_parameter_spec
+        self.parent_values = parent_values
+
+    @property
+    @classmethod
+    @abc.abstractmethod
+    def _proto_parameter_value_class(self) -> proto.Message:
+        """The proto representation of this parameter."""
+        pass
+
+    @property
+    @classmethod
+    @abc.abstractmethod
+    def _parameter_value_map(self) -> Tuple[Tuple[str, str]]:
+        """A Tuple map of parameter key to underlying proto key."""
+        pass
+
+    @property
+    @classmethod
+    @abc.abstractmethod
+    def _parameter_spec_value_key(self) -> str:
+        """The ParameterSpec key this parameter should be assigned."""
+        pass
+
+    @property
+    def _proto_parameter_value_spec(self) -> proto.Message:
+        """Converts this parameter to its parameter value representation."""
+        proto_parameter_value_spec = self._proto_parameter_value_class()
+        for self_attr_key, proto_attr_key in self._parameter_value_map:
+            setattr(
+                proto_parameter_value_spec, proto_attr_key, getattr(self, self_attr_key)
+            )
+        return proto_parameter_value_spec
+
+    def _to_parameter_spec(
+        self, parameter_id: str
+    ) -> gca_study_compat.StudySpec.ParameterSpec:
+        """Converts this parameter to ParameterSpec."""
+        # TODO: Conditional parameters
+        parameter_spec = gca_study_compat.StudySpec.ParameterSpec(
+            parameter_id=parameter_id,
+            scale_type=_SCALE_TYPE_MAP.get(getattr(self, "scale", "unspecified")),
+        )
+
+        setattr(
+            parameter_spec,
+            self._parameter_spec_value_key,
+            self._proto_parameter_value_spec,
+        )
+
+        return parameter_spec
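+
+# Illustrative note (added commentary, not part of the API): a spec such as
+#     DoubleParameterSpec(min=0.001, max=0.1, scale='log')
+# is converted by _to_parameter_spec('lr') into a
+# StudySpec.ParameterSpec proto with parameter_id='lr',
+# scale_type=UNIT_LOG_SCALE, and
+# double_value_spec={min_value: 0.001, max_value: 0.1}.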
+
+
+class DoubleParameterSpec(_ParameterSpec):
+
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec
+    )
+    _parameter_value_map = (("min", "min_value"), ("max", "max_value"))
+    _parameter_spec_value_key = "double_value_spec"
+
+    def __init__(
+        self, min: float, max: float, scale: str,
+    ):
+        """
+        Value specification for a parameter in ``DOUBLE`` type.
+
+        Args:
+            min (float):
+                Required. Inclusive minimum value of the
+                parameter.
+            max (float):
+                Required. Inclusive maximum value of the
+                parameter.
+            scale (str):
+                Required. The type of scaling that should be applied to this parameter.
+
+                Accepts: 'linear', 'log', 'reverse_log'
+        """
+
+        super().__init__()
+
+        self.min = min
+        self.max = max
+        self.scale = scale
+
+
+class IntegerParameterSpec(_ParameterSpec):
+
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec
+    )
+    _parameter_value_map = (("min", "min_value"), ("max", "max_value"))
+    _parameter_spec_value_key = "integer_value_spec"
+
+    def __init__(
+        self, min: int, max: int, scale: str,
+    ):
+        """
+        Value specification for a parameter in ``INTEGER`` type.
+
+        Args:
+            min (int):
+                Required. Inclusive minimum value of the
+                parameter.
+            max (int):
+                Required. Inclusive maximum value of the
+                parameter.
+            scale (str):
+                Required. The type of scaling that should be applied to this parameter.
+
+                Accepts: 'linear', 'log', 'reverse_log'
+        """
+
+        super().__init__()
+
+        self.min = min
+        self.max = max
+        self.scale = scale
+
+
+class CategoricalParameterSpec(_ParameterSpec):
+
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec
+    )
+    _parameter_value_map = (("values", "values"),)
+    _parameter_spec_value_key = "categorical_value_spec"
+
+    def __init__(
+        self, values: Sequence[str],
+    ):
+        """Value specification for a parameter in ``CATEGORICAL`` type.
+
+        Args:
+            values (Sequence[str]):
+                Required. The list of possible categories.
+        """
+
+        super().__init__()
+
+        self.values = values
+
+
+class DiscreteParameterSpec(_ParameterSpec):
+
+    _proto_parameter_value_class = (
+        gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec
+    )
+    _parameter_value_map = (("values", "values"),)
+    _parameter_spec_value_key = "discrete_value_spec"
+
+    def __init__(
+        self, values: Sequence[float], scale: str,
+    ):
+        """Value specification for a parameter in ``DISCRETE`` type.
+
+        values (Sequence[float]):
+            Required. A list of possible values.
+            The list should be in increasing order and at
+            least 1e-10 apart. For instance, this parameter
+            might have possible settings of 1.5, 2.5, and
+            4.0. This list should not contain more than
+            1,000 values.
+        scale (str):
+            Required. The type of scaling that should be applied to this parameter.
+
+            Accepts: 'linear', 'log', 'reverse_log'
+        """
+
+        super().__init__()
+
+        self.values = values
+        self.scale = scale
diff --git a/google/cloud/aiplatform/initializer.py b/google/cloud/aiplatform/initializer.py
index 9adae3be9a..18341bde46 100644
--- a/google/cloud/aiplatform/initializer.py
+++ b/google/cloud/aiplatform/initializer.py
@@ -247,23 +247,23 @@ def common_location_path(

     def create_client(
         self,
-        client_class: Type[utils.AiPlatformServiceClientWithOverride],
+        client_class: Type[utils.VertexAiServiceClientWithOverride],
         credentials: Optional[auth_credentials.Credentials] = None,
         location_override: Optional[str] = None,
         prediction_client: bool = False,
-    ) -> utils.AiPlatformServiceClientWithOverride:
-        """Instantiates a given AiPlatformServiceClient with optional
+    ) -> utils.VertexAiServiceClientWithOverride:
+        """Instantiates a given VertexAiServiceClient with optional
         overrides.

         Args:
-            client_class (utils.AiPlatformServiceClientWithOverride):
-                (Required) An AI Platform Service Client with optional overrides.
+            client_class (utils.VertexAiServiceClientWithOverride):
+                (Required) A Vertex AI Service Client with optional overrides.
             credentials (auth_credentials.Credentials):
                 Custom auth credentials. If not provided will use the current config.
             location_override (str): Optional location override.
             prediction_client (str): Optional flag to use a prediction endpoint.
Returns: - client: Instantiated AI Platform Service client with optional overrides + client: Instantiated Vertex AI Service client with optional overrides """ gapic_version = pkg_resources.get_distribution( "google-cloud-aiplatform", diff --git a/google/cloud/aiplatform/jobs.py b/google/cloud/aiplatform/jobs.py index ee6d46dde9..c37530a78f 100644 --- a/google/cloud/aiplatform/jobs.py +++ b/google/cloud/aiplatform/jobs.py @@ -18,6 +18,7 @@ from typing import Iterable, Optional, Union, Sequence, Dict, List import abc +import copy import sys import time import logging @@ -26,25 +27,34 @@ from google.cloud import bigquery from google.auth import credentials as auth_credentials +from google.protobuf import duration_pb2 # type: ignore from google.cloud import aiplatform from google.cloud.aiplatform import base -from google.cloud.aiplatform import initializer from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants +from google.cloud.aiplatform import initializer +from google.cloud.aiplatform import hyperparameter_tuning from google.cloud.aiplatform import utils +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud.aiplatform.compat.services import job_service_client from google.cloud.aiplatform.compat.types import ( - io as gca_io_compat, - io_v1beta1 as gca_io_v1beta1, - job_state as gca_job_state, batch_prediction_job as gca_bp_job_compat, batch_prediction_job_v1 as gca_bp_job_v1, batch_prediction_job_v1beta1 as gca_bp_job_v1beta1, + custom_job as gca_custom_job_compat, + custom_job_v1beta1 as gca_custom_job_v1beta1, + explanation_v1beta1 as gca_explanation_v1beta1, + io as gca_io_compat, + io_v1beta1 as gca_io_v1beta1, + job_state as gca_job_state, + hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, + hyperparameter_tuning_job_v1beta1 as gca_hyperparameter_tuning_job_v1beta1, machine_resources as gca_machine_resources_compat, machine_resources_v1beta1 as gca_machine_resources_v1beta1, - explanation_v1beta1 as gca_explanation_v1beta1, + study as gca_study_compat, ) logging.basicConfig(level=logging.INFO, stream=sys.stdout) @@ -63,8 +73,8 @@ ) -class _Job(base.AiPlatformResourceNounWithFutureManager): - """Class that represents a general Job resource in AI Platform (Unified). +class _Job(base.VertexAiResourceNounWithFutureManager): + """Class that represents a general Job resource in Vertex AI. Cannot be directly instantiated. Serves as base class to specific Job types, i.e. BatchPredictionJob or @@ -120,7 +130,7 @@ def state(self) -> gca_job_state.JobState: Returns: state (job_state.JobState): - Enum that describes the state of a AI Platform job. + Enum that describes the state of a Vertex AI job. """ # Fetch the Job again for most up-to-date job state @@ -173,15 +183,23 @@ def _block_until_complete(self): ) ) log_wait = min(log_wait * multiplier, max_wait) - previous_time = current_time + previous_time = current_time time.sleep(wait) - _LOGGER.log_action_completed_against_resource("", "run", self) - + _LOGGER.info( + "%s %s current state:\n%s" + % ( + self.__class__.__name__, + self._gca_resource.name, + self._gca_resource.state, + ) + ) # Error is only populated when the job state is # JOB_STATE_FAILED or JOB_STATE_CANCELLED. 
-        if self.state in _JOB_ERROR_STATES:
+        if self._gca_resource.state in _JOB_ERROR_STATES:
             raise RuntimeError("Job failed with:\n%s" % self._gca_resource.error)
+        else:
+            _LOGGER.log_action_completed_against_resource("run", "completed", self)

     @classmethod
     def list(
         cls,
         filter: Optional[str] = None,
@@ -191,7 +209,7 @@
         project: Optional[str] = None,
         location: Optional[str] = None,
         credentials: Optional[auth_credentials.Credentials] = None,
-    ) -> List[base.AiPlatformResourceNoun]:
+    ) -> List[base.VertexAiResourceNoun]:
         """List all instances of this Job Resource.

         Example Usage:
@@ -219,7 +237,7 @@
             credentials set in aiplatform.init.

         Returns:
-            List[AiPlatformResourceNoun] - A list of Job resource objects
+            List[VertexAiResourceNoun] - A list of Job resource objects
         """

         return cls._list_with_local_order(
@@ -329,7 +347,7 @@ def create(
                 or "file-list". Default is "jsonl" when using `gcs_source`. If a
                 `bigquery_source` is provided, this is overriden to "bigquery".
             predictions_format (str):
-                Required. The format in which AI Platform gives the
+                Required. The format in which Vertex AI gives the
                 predictions, must be one of "jsonl", "csv", or "bigquery".
                 Default is "jsonl" when using `gcs_destination_prefix`. If a
                 `bigquery_destination_prefix` is provided, this is overriden to
@@ -399,7 +417,7 @@
                 `machine_type`. Only used if `machine_type` is set.
             starting_replica_count (Optional[int]):
                 The number of machine replicas used at the start of the batch
-                operation. If not set, AI Platform decides starting number, not
+                operation. If not set, Vertex AI decides the starting number, not
                 greater than `max_replica_count`. Only used if `machine_type` is
                 set.
             max_replica_count (Optional[int]):
@@ -629,7 +647,7 @@ def _create(
                 Required. An instance of DatasetServiceClient with the correct api_endpoint
                 already set based on user's preferences.
             batch_prediction_job (gca_bp_job.BatchPredictionJob):
-                Required. a batch prediction job proto for creating a batch prediction job on AI Platform.
+                Required. A batch prediction job proto for creating a batch prediction job on Vertex AI.
             generate_explanation (bool):
                 Required. Generate explanation along with the batch prediction
                 results.
@@ -655,7 +673,7 @@
             ValueError:
                 If no or multiple source or destinations are provided. Also, if
                 provided instances_format or predictions_format are not supported
-                by AI Platform.
+                by Vertex AI.
         """
         # select v1beta1 if explain else use default v1
         if generate_explanation:
@@ -768,14 +786,89 @@ def iter_outputs(
         )


-class CustomJob(_Job):
-    _resource_noun = "customJobs"
-    _getter_method = "get_custom_job"
-    _list_method = "list_custom_job"
-    _cancel_method = "cancel_custom_job"
-    _delete_method = "delete_custom_job"
-    _job_type = "training"
-    pass
+class _RunnableJob(_Job):
+    """ABC to interface job as a runnable training class."""
+
+    def __init__(
+        self,
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+    ):
+        """Initializes job with project, location, and api_client.
+
+        Args:
+            project(str): Project of the resource noun.
+            location(str): The location of the resource noun.
+            credentials(google.auth.credentials.Credentials): Optional custom
+                credentials to use when interacting with the resource noun.
+        """
+
+        base.VertexAiResourceNounWithFutureManager.__init__(
+            self, project=project, location=location, credentials=credentials
+        )
+
+        self._parent = aiplatform.initializer.global_config.common_location_path(
+            project=project, location=location
+        )
+
+    @abc.abstractmethod
+    def run(self) -> None:
+        pass
+
+    @property
+    def _has_run(self) -> bool:
+        """Property returns true if this class has a resource name."""
+        return bool(self._gca_resource.name)
+
+    @property
+    def state(self) -> gca_job_state.JobState:
+        """Current state of job.
+
+        Raises:
+            RuntimeError if job run has not been called.
+        """
+        if not self._has_run:
+            raise RuntimeError("Job has not run. No state available.")
+
+        return super().state
+
+    @classmethod
+    def get(
+        cls,
+        resource_name: str,
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+    ) -> "_RunnableJob":
+        """Get a Vertex AI Job for the given resource_name.
+
+        Args:
+            resource_name (str):
+                Required. A fully-qualified resource name or ID.
+            project (str):
+                Optional project to retrieve dataset from. If not set, project
+                set in aiplatform.init will be used.
+            location (str):
+                Optional location to retrieve dataset from. If not set, location
+                set in aiplatform.init will be used.
+            credentials (auth_credentials.Credentials):
+                Custom credentials to use to upload this model. Overrides
+                credentials set in aiplatform.init.
+
+        Returns:
+            A Vertex AI Job.
+        """
+        self = cls._empty_constructor(
+            project=project,
+            location=location,
+            credentials=credentials,
+            resource_name=resource_name,
+        )
+
+        self._gca_resource = self._get_gca_resource(resource_name=resource_name)
+
+        return self


 class DataLabelingJob(_Job):
@@ -788,10 +881,648 @@ class DataLabelingJob(_Job):
     pass


-class HyperparameterTuningJob(_Job):
+class CustomJob(_RunnableJob):
+    """Vertex AI Custom Job."""
+
+    _resource_noun = "customJobs"
+    _getter_method = "get_custom_job"
+    _list_method = "list_custom_job"
+    _cancel_method = "cancel_custom_job"
+    _delete_method = "delete_custom_job"
+    _job_type = "training"
+
+    def __init__(
+        self,
+        display_name: str,
+        worker_pool_specs: Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]],
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+        encryption_spec_key_name: Optional[str] = None,
+        staging_bucket: Optional[str] = None,
+    ):
+        """Construct a Custom Job with Worker Pool Specs.
+
+        ```
+        Example usage:
+        worker_pool_specs = [
+                {
+                    "machine_spec": {
+                        "machine_type": "n1-standard-4",
+                        "accelerator_type": "NVIDIA_TESLA_K80",
+                        "accelerator_count": 1,
+                    },
+                    "replica_count": 1,
+                    "container_spec": {
+                        "image_uri": container_image_uri,
+                        "command": [],
+                        "args": [],
+                    },
+                }
+            ]
+
+        my_job = aiplatform.CustomJob(
+            display_name='my_job',
+            worker_pool_specs=worker_pool_specs
+        )
+
+        my_job.run()
+        ```
+
+
+        For more information on configuring worker pool specs please visit:
+        https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job
+
+
+        Args:
+            display_name (str):
+                Required. The user-defined name of this CustomJob.
+                The name can be up to 128 characters long and can consist
+                of any UTF-8 characters.
+            worker_pool_specs (Union[List[Dict], List[aiplatform.gapic.WorkerPoolSpec]]):
+                Required. The spec of the worker pools including machine type and Docker image.
+                Can be provided as a list of dictionaries or list of WorkerPoolSpec proto messages.
+            project (str):
+                Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
+            location (str):
+                Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
+            credentials (auth_credentials.Credentials):
+                Optional. Custom credentials to use when calling the custom job service. Overrides
+                credentials set in aiplatform.init.
+            encryption_spec_key_name (str):
+                Optional. Customer-managed encryption key name for a
+                CustomJob. If this is set, then all resources
+                created by the CustomJob will be encrypted with
+                the provided encryption key.
+            staging_bucket (str):
+                Optional. Bucket for produced custom job artifacts. Overrides
+                staging_bucket set in aiplatform.init.
+
+        Raises:
+            RuntimeError: If a staging bucket was not set using aiplatform.init
+                and a staging bucket was not passed in.
+        """
+
+        super().__init__(project=project, location=location, credentials=credentials)
+
+        staging_bucket = staging_bucket or initializer.global_config.staging_bucket
+
+        if not staging_bucket:
+            raise RuntimeError(
+                "staging_bucket should be passed to CustomJob constructor or "
+                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
+            )
+
+        self._gca_resource = gca_custom_job_compat.CustomJob(
+            display_name=display_name,
+            job_spec=gca_custom_job_compat.CustomJobSpec(
+                worker_pool_specs=worker_pool_specs,
+                base_output_directory=gca_io_compat.GcsDestination(
+                    output_uri_prefix=staging_bucket
+                ),
+            ),
+            encryption_spec=initializer.global_config.get_encryption_spec(
+                encryption_spec_key_name=encryption_spec_key_name
+            ),
+        )
+
+    @classmethod
+    def from_local_script(
+        cls,
+        display_name: str,
+        script_path: str,
+        container_uri: str,
+        args: Optional[List[Union[str, float, int]]] = None,
+        requirements: Optional[Sequence[str]] = None,
+        environment_variables: Optional[Dict[str, str]] = None,
+        replica_count: int = 1,
+        machine_type: str = "n1-standard-4",
+        accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED",
+        accelerator_count: int = 0,
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+        encryption_spec_key_name: Optional[str] = None,
+        staging_bucket: Optional[str] = None,
+    ) -> "CustomJob":
+        """Configures a custom job from a local script.
+
+        Example usage:
+        ```
+        job = aiplatform.CustomJob.from_local_script(
+            display_name="my-custom-job",
+            script_path="training_script.py",
+            container_uri="gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest",
+            requirements=["gcsfs==0.7.1"],
+            replica_count=1,
+            args=['--dataset', 'gs://my-bucket/my-dataset',
+            '--model_output_uri', 'gs://my-bucket/model']
+        )
+
+        job.run()
+        ```
+
+        Args:
+            display_name (str):
+                Required. The user-defined name of this CustomJob.
+            script_path (str):
+                Required. Local path to training script.
+            container_uri (str):
+                Required. URI of the training container image to use for the custom job.
+            args (Optional[List[Union[str, float, int]]]):
+                Optional. Command line arguments to be passed to the Python task.
+            requirements (Sequence[str]):
+                Optional. List of python package dependencies of the script.
+            environment_variables (Dict[str, str]):
+                Optional. Environment variables to be passed to the container.
+                Should be a dictionary where keys are environment variable names
+                and values are environment variable values for those names.
+                At most 10 environment variables can be specified.
+                The name of the environment variable must be unique.
+
+                environment_variables = {
+                    'MY_KEY': 'MY_VALUE'
+                }
+            replica_count (int):
+                Optional. The number of worker replicas. If replica count = 1 then one chief
+                replica will be provisioned. If replica_count > 1 the remainder will be
+                provisioned as a worker replica pool.
+            machine_type (str):
+                Optional. The type of machine to use for training.
+            accelerator_type (str):
+                Optional. Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED,
+                NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4,
+                NVIDIA_TESLA_T4
+            accelerator_count (int):
+                Optional. The number of accelerators to attach to a worker replica.
+            project (str):
+                Optional. Project to run the custom job in. Overrides project set in aiplatform.init.
+            location (str):
+                Optional. Location to run the custom job in. Overrides location set in aiplatform.init.
+            credentials (auth_credentials.Credentials):
+                Optional. Custom credentials to use when calling the custom job service. Overrides
+                credentials set in aiplatform.init.
+            encryption_spec_key_name (str):
+                Optional. Customer-managed encryption key name for a
+                CustomJob. If this is set, then all resources
+                created by the CustomJob will be encrypted with
+                the provided encryption key.
+            staging_bucket (str):
+                Optional. Bucket for produced custom job artifacts. Overrides
+                staging_bucket set in aiplatform.init.
+
+        Raises:
+            RuntimeError: If a staging bucket was not set using aiplatform.init
+                and a staging bucket was not passed in.
+        """
+
+        project = project or initializer.global_config.project
+        location = location or initializer.global_config.location
+        staging_bucket = staging_bucket or initializer.global_config.staging_bucket
+
+        if not staging_bucket:
+            raise RuntimeError(
+                "staging_bucket should be passed to CustomJob.from_local_script or "
+                "should be set using aiplatform.init(staging_bucket='gs://my-bucket')"
+            )
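+
+        # The steps below build the worker pool layout (a chief replica plus
+        # an optional worker pool), package the local script as a Python
+        # distribution staged in GCS, and point each pool at the packaged
+        # module so the job can execute it.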
+        worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool(
+            replica_count=replica_count,
+            machine_type=machine_type,
+            accelerator_count=accelerator_count,
+            accelerator_type=accelerator_type,
+        ).pool_specs
+
+        python_packager = source_utils._TrainingScriptPythonPackager(
+            script_path=script_path, requirements=requirements
+        )
+
+        package_gcs_uri = python_packager.package_and_copy_to_gcs(
+            gcs_staging_dir=staging_bucket, project=project, credentials=credentials,
+        )
+
+        for spec in worker_pool_specs:
+            spec["python_package_spec"] = {
+                "executor_image_uri": container_uri,
+                "python_module": python_packager.module_name,
+                "package_uris": [package_gcs_uri],
+            }
+
+            if args:
+                spec["python_package_spec"]["args"] = args
+
+            if environment_variables:
+                spec["python_package_spec"]["env"] = [
+                    {"name": key, "value": value}
+                    for key, value in environment_variables.items()
+                ]
+
+        return cls(
+            display_name=display_name,
+            worker_pool_specs=worker_pool_specs,
+            project=project,
+            location=location,
+            credentials=credentials,
+            encryption_spec_key_name=encryption_spec_key_name,
+            staging_bucket=staging_bucket,
+        )
+
+    @base.optional_sync()
+    def run(
+        self,
+        service_account: Optional[str] = None,
+        network: Optional[str] = None,
+        timeout: Optional[int] = None,
+        restart_job_on_worker_restart: bool = False,
+        tensorboard: Optional[str] = None,
+        sync: bool = True,
+    ) -> None:
+        """Run this configured CustomJob.
+
+        Args:
+            service_account (str):
+                Optional. Specifies the service account for workload run-as account.
+                Users submitting jobs must have act-as permission on this run-as account.
+            network (str):
+                Optional. The full name of the Compute Engine network to which the job
+                should be peered. For example, projects/12345/global/networks/myVPC.
+                Private services access must already be configured for the network.
+                If left unspecified, the job is not peered with any network.
+            timeout (int):
+                Optional. The maximum job running time in seconds. The default is 7 days.
+            restart_job_on_worker_restart (bool):
+                Restarts the entire CustomJob if a worker
+                gets restarted. This feature can be used by
+                distributed training jobs that are not resilient
+                to workers leaving and joining a job.
+            tensorboard (str):
+                Optional. The name of a Vertex AI
+                [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
+                resource to which this CustomJob will upload Tensorboard
+                logs. Format:
+                ``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
+
+                The training script should write Tensorboard logs to the following
+                Vertex AI environment variable:
+
+                AIP_TENSORBOARD_LOG_DIR
+
+                `service_account` is required with provided `tensorboard`.
+                For more information on configuring your service account please visit:
+                https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
+            sync (bool):
+                Whether to execute this method synchronously. If False, this method
+                will unblock and it will be executed in a concurrent Future.
+        """
+
+        if service_account:
+            self._gca_resource.job_spec.service_account = service_account
+
+        if network:
+            self._gca_resource.job_spec.network = network
+
+        if timeout or restart_job_on_worker_restart:
+            timeout = duration_pb2.Duration(seconds=timeout) if timeout else None
+            self._gca_resource.job_spec.scheduling = gca_custom_job_compat.Scheduling(
+                timeout=timeout,
+                restart_job_on_worker_restart=restart_job_on_worker_restart,
+            )
+
+        if tensorboard:
+            v1beta1_gca_resource = gca_custom_job_v1beta1.CustomJob()
+            v1beta1_gca_resource._pb.MergeFromString(
+                self._gca_resource._pb.SerializeToString()
+            )
+            self._gca_resource = v1beta1_gca_resource
+            self._gca_resource.job_spec.tensorboard = tensorboard
+
+        _LOGGER.log_create_with_lro(self.__class__)
+
+        version = "v1beta1" if tensorboard else "v1"
+        self._gca_resource = self.api_client.select_version(version).create_custom_job(
+            parent=self._parent, custom_job=self._gca_resource
+        )
+
+        _LOGGER.log_create_complete_with_getter(
+            self.__class__, self._gca_resource, "custom_job"
+        )
+
+        _LOGGER.info("View Custom Job:\n%s" % self._dashboard_uri())
+
+        self._block_until_complete()
+
+    @property
+    def job_spec(self):
+        return self._gca_resource.job_spec
+
+
+_SEARCH_ALGORITHM_TO_PROTO_VALUE = {
+    "random": gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH,
+    "grid": gca_study_compat.StudySpec.Algorithm.GRID_SEARCH,
+    None: gca_study_compat.StudySpec.Algorithm.ALGORITHM_UNSPECIFIED,
+}
+
+_MEASUREMENT_SELECTION_TO_PROTO_VALUE = {
+    "best": gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT,
+    "last": gca_study_compat.StudySpec.MeasurementSelectionType.LAST_MEASUREMENT,
+}
+
+
+class HyperparameterTuningJob(_RunnableJob):
+    """Vertex AI Hyperparameter Tuning Job."""

     _resource_noun = "hyperparameterTuningJobs"
     _getter_method = "get_hyperparameter_tuning_job"
     _list_method = "list_hyperparameter_tuning_jobs"
     _cancel_method = "cancel_hyperparameter_tuning_job"
     _delete_method = "delete_hyperparameter_tuning_job"
-    pass
+    _job_type = "training"
+
+    def __init__(
+        self,
+        display_name: str,
+        custom_job: CustomJob,
+        metric_spec: Dict[str, str],
+        parameter_spec: Dict[str, hyperparameter_tuning._ParameterSpec],
+        max_trial_count: int,
+        parallel_trial_count: int,
+        max_failed_trial_count: int = 0,
+        search_algorithm: Optional[str] = None,
+        measurement_selection: Optional[str] = "best",
+        project: Optional[str] = None,
+        location: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+        encryption_spec_key_name: Optional[str] = None,
+    ):
+        """
+        Configures a HyperparameterTuning Job.
+
+        Example usage:
+
+        ```
+        from google.cloud.aiplatform import hyperparameter_tuning as hpt
+
+        worker_pool_specs = [
+            {
+                "machine_spec": {
+                    "machine_type": "n1-standard-4",
+                    "accelerator_type": "NVIDIA_TESLA_K80",
+                    "accelerator_count": 1,
+                },
+                "replica_count": 1,
+                "container_spec": {
+                    "image_uri": container_image_uri,
+                    "command": [],
+                    "args": [],
+                },
+            }
+        ]
+
+        custom_job = aiplatform.CustomJob(
+            display_name='my_job',
+            worker_pool_specs=worker_pool_specs
+        )
+
+        hp_job = aiplatform.HyperparameterTuningJob(
+            display_name='hp-test',
+            custom_job=custom_job,
+            metric_spec={
+                'loss': 'minimize',
+            },
+            parameter_spec={
+                'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
+                'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
+                'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
+                'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
+            },
+            max_trial_count=128,
+            parallel_trial_count=8,
+        )
+
+        hp_job.run()
+
+        print(hp_job.trials)
+        ```
+
+        For more information on using hyperparameter tuning please visit:
+        https://cloud.google.com/ai-platform-unified/docs/training/using-hyperparameter-tuning
+
+        Args:
+            display_name (str):
+                Required. The user-defined name of the HyperparameterTuningJob.
+                The name can be up to 128 characters long and can consist
+                of any UTF-8 characters.
+            custom_job (aiplatform.CustomJob):
+                Required. Configured CustomJob. The worker pool spec from this custom job
+                applies to the CustomJobs created in all the trials.
+            metric_spec (Dict[str, str]):
+                Required. Dictionary representing metrics to optimize. The dictionary key is the metric_id,
+                which is reported by your training job, and the dictionary value is the
+                optimization goal of the metric ('minimize' or 'maximize'). Example:
+
+                metric_spec = {'loss': 'minimize', 'accuracy': 'maximize'}
+
+            parameter_spec (Dict[str, hyperparameter_tuning._ParameterSpec]):
+                Required. Dictionary representing parameters to optimize. The dictionary key is the parameter_id,
+                which is passed into your training job as a command line keyword argument, and the
+                dictionary value is the parameter specification. Example:
+
+                from google.cloud.aiplatform import hyperparameter_tuning as hpt
+
+                parameter_spec = {
+                    'decay': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
+                    'learning_rate': hpt.DoubleParameterSpec(min=1e-7, max=1, scale='linear'),
+                    'batch_size': hpt.DiscreteParameterSpec(values=[4, 8, 16, 32, 64, 128], scale='linear')
+                }
+
+                Supported parameter specifications can be found in aiplatform.hyperparameter_tuning.
+                These parameter specifications are currently supported:
+                DoubleParameterSpec, IntegerParameterSpec, CategoricalParameterSpec, DiscreteParameterSpec
+
+            max_trial_count (int):
+                Required. The desired total number of Trials.
+            parallel_trial_count (int):
+                Required. The desired number of Trials to run in parallel.
+            max_failed_trial_count (int):
+                Optional. The number of failed Trials that need to be
+                seen before failing the HyperparameterTuningJob.
+                If set to 0, Vertex AI decides how many Trials
+                must fail before the whole job fails.
+            search_algorithm (str):
+                The search algorithm specified for the Study.
+                Accepts one of the following:
+                `None` - If you do not specify an algorithm, your job uses
+                the default Vertex AI algorithm. The default algorithm
+                applies Bayesian optimization to arrive at the optimal
+                solution with a more effective search over the parameter space.
+
+                'grid' - A simple grid search within the feasible space. This
+                option is particularly useful if you want to specify a quantity
+                of trials that is greater than the number of points in the
+                feasible space. In such cases, if you do not specify a grid
+                search, the Vertex AI default algorithm may generate duplicate
+                suggestions. To use grid search, all parameter specs must be
+                of type `IntegerParameterSpec`, `CategoricalParameterSpec`,
+                or `DiscreteParameterSpec`.
+
+                'random' - A simple random search within the feasible space.
+            measurement_selection (str):
+                This indicates which measurement to use if/when the service
+                automatically selects the final measurement from previously reported
+                intermediate measurements.
+
+                Accepts: 'best', 'last'
+
+                Choose this based on two considerations:
+                A) Do you expect your measurements to monotonically improve? If so,
+                choose 'last'. On the other hand, if you're in a situation
+                where your system can "over-train" and you expect the performance to
+                get better for a while but then start declining, choose
+                'best'. B) Are your measurements significantly noisy
+                and/or irreproducible? If so, 'best' will tend to be
+                over-optimistic, and it may be better to choose 'last'. If
+                both or neither of (A) and (B) apply, it doesn't matter which
+                selection type is chosen.
+            project (str):
+                Optional. Project to run the HyperparameterTuningJob in. Overrides project set in aiplatform.init.
+            location (str):
+                Optional. Location to run the HyperparameterTuningJob in. Overrides location set in aiplatform.init.
+            credentials (auth_credentials.Credentials):
+                Optional. Custom credentials to use to call the HyperparameterTuningJob service. Overrides
+                credentials set in aiplatform.init.
+            encryption_spec_key_name (str):
+                Optional. Customer-managed encryption key options for a
+                HyperparameterTuningJob. If this is set, then
+                all resources created by the
+                HyperparameterTuningJob will be encrypted with
+                the provided encryption key.
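+
+        A minimal, illustrative sketch of the two options above (the metric and
+        parameter names here are placeholders, not defaults; `custom_job` and
+        `hpt` are defined as in the example usage above):
+
+        ```
+        hp_job = aiplatform.HyperparameterTuningJob(
+            display_name='hp-grid-test',
+            custom_job=custom_job,
+            metric_spec={'accuracy': 'maximize'},
+            parameter_spec={
+                # grid search requires Integer, Categorical, or Discrete specs
+                'batch_size': hpt.DiscreteParameterSpec(
+                    values=[16, 32, 64], scale='linear'),
+            },
+            max_trial_count=3,
+            parallel_trial_count=1,
+            search_algorithm='grid',
+            # with noisy metrics, 'last' can be less over-optimistic than 'best'
+            measurement_selection='last',
+        )
+        ```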
+        """
+        super().__init__(project=project, location=location, credentials=credentials)
+
+        metrics = [
+            gca_study_compat.StudySpec.MetricSpec(
+                metric_id=metric_id, goal=goal.upper()
+            )
+            for metric_id, goal in metric_spec.items()
+        ]
+
+        parameters = [
+            parameter._to_parameter_spec(parameter_id=parameter_id)
+            for parameter_id, parameter in parameter_spec.items()
+        ]
+
+        study_spec = gca_study_compat.StudySpec(
+            metrics=metrics,
+            parameters=parameters,
+            algorithm=_SEARCH_ALGORITHM_TO_PROTO_VALUE[search_algorithm],
+            measurement_selection_type=_MEASUREMENT_SELECTION_TO_PROTO_VALUE[
+                measurement_selection
+            ],
+        )
+
+        self._gca_resource = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob(
+            display_name=display_name,
+            study_spec=study_spec,
+            max_trial_count=max_trial_count,
+            parallel_trial_count=parallel_trial_count,
+            max_failed_trial_count=max_failed_trial_count,
+            trial_job_spec=copy.deepcopy(custom_job.job_spec),
+            encryption_spec=initializer.global_config.get_encryption_spec(
+                encryption_spec_key_name=encryption_spec_key_name
+            ),
+        )
+
+    @base.optional_sync()
+    def run(
+        self,
+        service_account: Optional[str] = None,
+        network: Optional[str] = None,
+        timeout: Optional[int] = None,  # seconds
+        restart_job_on_worker_restart: bool = False,
+        tensorboard: Optional[str] = None,
+        sync: bool = True,
+    ) -> None:
+        """Run this configured HyperparameterTuningJob.
+
+        Args:
+            service_account (str):
+                Optional. Specifies the service account for workload run-as account.
+                Users submitting jobs must have act-as permission on this run-as account.
+            network (str):
+                Optional. The full name of the Compute Engine network to which the job
+                should be peered. For example, projects/12345/global/networks/myVPC.
+                Private services access must already be configured for the network.
+                If left unspecified, the job is not peered with any network.
+            timeout (int):
+                Optional. The maximum job running time in seconds. The default is 7 days.
+            restart_job_on_worker_restart (bool):
+                Restarts the entire CustomJob if a worker
+                gets restarted. This feature can be used by
+                distributed training jobs that are not resilient
+                to workers leaving and joining a job.
+            tensorboard (str):
+                Optional. The name of a Vertex AI
+                [Tensorboard][google.cloud.aiplatform.v1beta1.Tensorboard]
+                resource to which this CustomJob will upload Tensorboard
+                logs. Format:
+                ``projects/{project}/locations/{location}/tensorboards/{tensorboard}``
+
+                The training script should write Tensorboard logs to the following
+                Vertex AI environment variable:
+
+                AIP_TENSORBOARD_LOG_DIR
+
+                `service_account` is required with provided `tensorboard`.
+                For more information on configuring your service account please visit:
+                https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training
+            sync (bool):
+                Whether to execute this method synchronously. If False, this method
+                will unblock and it will be executed in a concurrent Future.
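+
+        A minimal sketch of a run that uploads Tensorboard logs (the resource
+        names below are placeholders; `service_account` must accompany
+        `tensorboard`):
+
+        ```
+        hp_job.run(
+            service_account='my-sa@my-project.iam.gserviceaccount.com',
+            tensorboard=('projects/my-project/locations/us-central1/'
+                         'tensorboards/my-tensorboard'),
+            timeout=3600,  # cap total job running time at one hour
+        )
+        ```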
+ """ + + if service_account: + self._gca_resource.trial_job_spec.service_account = service_account + + if network: + self._gca_resource.trial_job_spec.network = network + + if timeout or restart_job_on_worker_restart: + duration = duration_pb2.Duration(seconds=timeout) if timeout else None + self._gca_resource.trial_job_spec.scheduling = gca_custom_job_compat.Scheduling( + timeout=duration, + restart_job_on_worker_restart=restart_job_on_worker_restart, + ) + + if tensorboard: + v1beta1_gca_resource = ( + gca_hyperparameter_tuning_job_v1beta1.HyperparameterTuningJob() + ) + v1beta1_gca_resource._pb.MergeFromString( + self._gca_resource._pb.SerializeToString() + ) + self._gca_resource = v1beta1_gca_resource + self._gca_resource.trial_job_spec.tensorboard = tensorboard + + _LOGGER.log_create_with_lro(self.__class__) + + version = "v1beta1" if tensorboard else "v1" + self._gca_resource = self.api_client.select_version( + version + ).create_hyperparameter_tuning_job( + parent=self._parent, hyperparameter_tuning_job=self._gca_resource + ) + + _LOGGER.log_create_complete_with_getter( + self.__class__, self._gca_resource, "hpt_job" + ) + + _LOGGER.info("View HyperparameterTuningJob:\n%s" % self._dashboard_uri()) + + self._block_until_complete() + + @property + def trials(self) -> List[gca_study_compat.Trial]: + return list(self._gca_resource.trials) diff --git a/google/cloud/aiplatform/metadata/artifact.py b/google/cloud/aiplatform/metadata/artifact.py index 98eefacc5f..b3ef6e09a2 100644 --- a/google/cloud/aiplatform/metadata/artifact.py +++ b/google/cloud/aiplatform/metadata/artifact.py @@ -26,7 +26,7 @@ class _Artifact(_Resource): - """Metadata Artifact resource for AI Platform""" + """Metadata Artifact resource for Vertex AI""" _resource_noun = "artifacts" _getter_method = "get_artifact" diff --git a/google/cloud/aiplatform/metadata/context.py b/google/cloud/aiplatform/metadata/context.py index cb3340499b..ddd583bbdf 100644 --- a/google/cloud/aiplatform/metadata/context.py +++ b/google/cloud/aiplatform/metadata/context.py @@ -26,7 +26,7 @@ class _Context(_Resource): - """Metadata Context resource for AI Platform""" + """Metadata Context resource for Vertex AI""" _resource_noun = "contexts" _getter_method = "get_context" diff --git a/google/cloud/aiplatform/metadata/execution.py b/google/cloud/aiplatform/metadata/execution.py index 39fc7a74b3..3605efdb4f 100644 --- a/google/cloud/aiplatform/metadata/execution.py +++ b/google/cloud/aiplatform/metadata/execution.py @@ -29,7 +29,7 @@ class _Execution(_Resource): - """Metadata Execution resource for AI Platform""" + """Metadata Execution resource for Vertex AI""" _resource_noun = "executions" _getter_method = "get_execution" diff --git a/google/cloud/aiplatform/metadata/metadata_store.py b/google/cloud/aiplatform/metadata/metadata_store.py index 2a55f066a8..494d31aca4 100644 --- a/google/cloud/aiplatform/metadata/metadata_store.py +++ b/google/cloud/aiplatform/metadata/metadata_store.py @@ -27,8 +27,8 @@ from google.cloud.aiplatform_v1beta1.types import metadata_store as gca_metadata_store -class _MetadataStore(base.AiPlatformResourceNounWithFutureManager): - """Managed MetadataStore resource for AI Platform""" +class _MetadataStore(base.VertexAiResourceNounWithFutureManager): + """Managed MetadataStore resource for Vertex AI""" client_class = utils.MetadataClientWithOverride _is_client_prediction_client = False diff --git a/google/cloud/aiplatform/metadata/resource.py b/google/cloud/aiplatform/metadata/resource.py index 11f03b7af1..85ac419d40 
100644 --- a/google/cloud/aiplatform/metadata/resource.py +++ b/google/cloud/aiplatform/metadata/resource.py @@ -33,8 +33,8 @@ from google.cloud.aiplatform_v1beta1 import Execution as GapicExecution -class _Resource(base.AiPlatformResourceNounWithFutureManager, abc.ABC): - """Metadata Resource for AI Platform""" +class _Resource(base.VertexAiResourceNounWithFutureManager, abc.ABC): + """Metadata Resource for Vertex AI""" client_class = utils.MetadataClientWithOverride _is_client_prediction_client = False diff --git a/google/cloud/aiplatform/models.py b/google/cloud/aiplatform/models.py index cecc992644..b93f569eaa 100644 --- a/google/cloud/aiplatform/models.py +++ b/google/cloud/aiplatform/models.py @@ -73,7 +73,7 @@ class Prediction(NamedTuple): explanations: Optional[Sequence[gca_explanation_v1beta1.Explanation]] = None -class Endpoint(base.AiPlatformResourceNounWithFutureManager): +class Endpoint(base.VertexAiResourceNounWithFutureManager): client_class = utils.EndpointClientWithOverride _is_client_prediction_client = False @@ -1181,7 +1181,7 @@ def undeploy_all(self, sync: bool = True) -> "Endpoint": return self def delete(self, force: bool = False, sync: bool = True) -> None: - """Deletes this AI Platform Endpoint resource. If force is set to True, + """Deletes this Vertex AI Endpoint resource. If force is set to True, all models on this Endpoint will be undeployed prior to deletion. Args: @@ -1201,7 +1201,7 @@ def delete(self, force: bool = False, sync: bool = True) -> None: super().delete(sync=sync) -class Model(base.AiPlatformResourceNounWithFutureManager): +class Model(base.VertexAiResourceNounWithFutureManager): client_class = utils.ModelClientWithOverride _is_client_prediction_client = False @@ -1325,11 +1325,11 @@ def upload( serving_container_predict_route (str): Optional. An HTTP path to send prediction requests to the container, and which must be supported by it. If not specified a default HTTP path will - be used by AI Platform. + be used by Vertex AI. serving_container_health_route (str): Optional. An HTTP path to send health check requests to the container, and which must be supported by it. If not specified a standard HTTP path will be - used by AI Platform. + used by Vertex AI. description (str): The description of the model. serving_container_command: Optional[Sequence[str]]=None, @@ -1353,7 +1353,7 @@ def upload( and values are environment variable values for those names. serving_container_ports: Optional[Sequence[int]]=None, Declaration of ports that are exposed by the container. This field is - primarily informational, it gives AI Platform information about the + primarily informational, it gives Vertex AI information about the network connections the container uses. Listing or not a port here has no impact on whether the port is actually exposed, any port listening on the default "0.0.0.0" address inside a container will be accessible from @@ -1898,7 +1898,7 @@ def batch_predict( ```google.rpc.Status`` `__ represented as a STRUCT, and containing only ``code`` and ``message``. predictions_format: str = "jsonl" - Required. The format in which AI Platform gives the + Required. The format in which Vertex AI gives the predictions, must be one of "jsonl", "csv", or "bigquery". Default is "jsonl" when using `gcs_destination_prefix`. If a `bigquery_destination_prefix` is provided, this is overriden to @@ -1919,7 +1919,7 @@ def batch_predict( `machine_type`. Only used if `machine_type` is set. 
starting_replica_count: Optional[int] = None The number of machine replicas used at the start of the batch - operation. If not set, AI Platform decides starting number, not + operation. If not set, Vertex AI decides starting number, not greater than `max_replica_count`. Only used if `machine_type` is set. max_replica_count: Optional[int] = None diff --git a/google/cloud/aiplatform/schema.py b/google/cloud/aiplatform/schema.py index 6b2a3d7d66..a1da75d9e6 100644 --- a/google/cloud/aiplatform/schema.py +++ b/google/cloud/aiplatform/schema.py @@ -15,7 +15,7 @@ # limitations under the License. # -"""Namespaced AI Platform Schemas.""" +"""Namespaced Vertex AI Schemas.""" class training_job: diff --git a/google/cloud/aiplatform/tensorboard/uploader_main.py b/google/cloud/aiplatform/tensorboard/uploader_main.py index 734d647fb4..ebd4aa5147 100644 --- a/google/cloud/aiplatform/tensorboard/uploader_main.py +++ b/google/cloud/aiplatform/tensorboard/uploader_main.py @@ -49,7 +49,7 @@ flags.DEFINE_integer( "event_file_inactive_secs", None, - "Age in seconds of last write after which an event file is considered " "inactive.", + "Age in seconds of last write after which an event file is considered inactive.", ) flags.DEFINE_string( "run_name_prefix", @@ -57,6 +57,16 @@ "If present, all runs created by this invocation will have their name " "prefixed by this value.", ) +flags.DEFINE_string( + "api_uri", + "aiplatform.googleapis.com", + "The API URI for fetching Tensorboard metadata.", +) +flags.DEFINE_string( + "web_server_uri", + "tensorboard.googleusercontent.com", + "The API URI for accessing the Tensorboard UI.", +) flags.DEFINE_multi_string( "allowed_plugins", @@ -79,6 +89,7 @@ def main(argv): if len(argv) > 1: raise app.UsageError("Too many command-line arguments.") + aiplatform.constants.API_BASE_PATH = FLAGS.api_uri m = re.match( "projects/(.*)/locations/(.*)/tensorboards/.*", FLAGS.tensorboard_resource_name ) @@ -131,7 +142,7 @@ def main(argv): print( "View your Tensorboard at https://{}.{}/experiment/{}".format( region, - "tensorboard.googleusercontent.com", + FLAGS.web_server_uri, tb_uploader.get_experiment_resource_name().replace("/", "+"), ) ) @@ -141,8 +152,16 @@ def main(argv): tb_uploader.start_uploading() +def flags_parser(args): + # Plumbs the flags defined in this file to the main module, mostly for the + # console script wrapper tb-gcp-uploader. + for flag in set(flags.FLAGS.get_key_flags_for_module(__name__)): + flags.FLAGS.register_flag_by_module(args[0], flag) + return app.parse_flags_with_usage(args) + + def run_main(): - app.run(main) + app.run(main, flags_parser=flags_parser) if __name__ == "__main__": diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py index 2912806a12..91e061f4ba 100644 --- a/google/cloud/aiplatform/training_jobs.py +++ b/google/cloud/aiplatform/training_jobs.py @@ -15,16 +15,8 @@ # limitations under the License. 
# -import datetime -import functools -import logging -import pathlib -import shutil -import subprocess -import sys -import tempfile import time -from typing import Callable, Dict, List, Optional, NamedTuple, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union import abc @@ -38,25 +30,25 @@ from google.cloud.aiplatform import utils from google.cloud.aiplatform.compat.types import ( - accelerator_type as gca_accelerator_type, env_var as gca_env_var, io as gca_io, model as gca_model, pipeline_state as gca_pipeline_state, training_pipeline as gca_training_pipeline, ) +from google.cloud.aiplatform.utils import _timestamped_gcs_dir +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud.aiplatform.v1.schema.trainingjob import ( definition_v1 as training_job_inputs, ) -from google.cloud import storage from google.rpc import code_pb2 import proto -logging.basicConfig(level=logging.INFO, stream=sys.stdout) _LOGGER = base.Logger(__name__) _PIPELINE_COMPLETE_STATES = set( @@ -69,7 +61,7 @@ ) -class _TrainingJob(base.AiPlatformResourceNounWithFutureManager): +class _TrainingJob(base.VertexAiResourceNounWithFutureManager): client_class = utils.PipelineClientWithOverride _is_client_prediction_client = False @@ -174,7 +166,7 @@ def get( doesn't match the custom training task definition. Returns: - An AI Platform Training Job + An Vertex AI Training Job """ # Create job with dummy parameters @@ -285,7 +277,7 @@ def _create_input_data_config( gcs_destination_uri_prefix (str): Optional. The Google Cloud Storage location. - The AI Platform environment variables representing Google + The Vertex AI environment variables representing Google Cloud Storage data URIs will always be represented in the Google Cloud Storage wildcard format to support sharded data. @@ -457,14 +449,14 @@ def _run_job( does not support uploading a Model as part of the pipeline. When the Pipeline's state becomes ``PIPELINE_STATE_SUCCEEDED`` and the trained Model had been - uploaded into AI Platform, then the model_to_upload's + uploaded into Vertex AI, then the model_to_upload's resource ``name`` is populated. The Model is always uploaded into the Project and Location in which this pipeline is. gcs_destination_uri_prefix (str): Optional. The Google Cloud Storage location. - The AI Platform environment variables representing Google + The Vertex AI environment variables representing Google Cloud Storage data URIs will always be represented in the Google Cloud Storage wildcard format to support sharded data. @@ -554,7 +546,7 @@ def state(self) -> Optional[gca_pipeline_state.PipelineState]: return self._gca_resource.state def get_model(self, sync=True) -> models.Model: - """AI Platform Model produced by this training, if one was produced. + """Vertex AI Model produced by this training, if one was produced. Args: sync (bool): @@ -563,7 +555,7 @@ def get_model(self, sync=True) -> models.Model: be immediately returned and synced when the Future has completed. Returns: - model: AI Platform Model produced by this training + model: Vertex AI Model produced by this training Raises: RuntimeError: If training failed or if a model was not produced by this training. @@ -577,7 +569,7 @@ def get_model(self, sync=True) -> models.Model: @base.optional_sync() def _force_get_model(self, sync: bool = True) -> models.Model: - """AI Platform Model produced by this training, if one was produced. 
+ """Vertex AI Model produced by this training, if one was produced. Args: sync (bool): @@ -586,7 +578,7 @@ def _force_get_model(self, sync: bool = True) -> models.Model: be immediately returned and synced when the Future has completed. Returns: - model: AI Platform Model produced by this training + model: Vertex AI Model produced by this training Raises: RuntimeError: If training failed or if a model was not produced by this training. @@ -602,7 +594,7 @@ def _get_model(self) -> Optional[models.Model]: """Helper method to get and instantiate the Model to Upload. Returns: - model: AI Platform Model if training succeeded and produced an AI Platform + model: Vertex AI Model if training succeeded and produced an Vertex AI Model. None otherwise. Raises: @@ -717,7 +709,7 @@ def list( project: Optional[str] = None, location: Optional[str] = None, credentials: Optional[auth_credentials.Credentials] = None, - ) -> List["base.AiPlatformResourceNoune"]: + ) -> List["base.VertexAiResourceNoun"]: """List all instances of this TrainingJob resource. Example Usage: @@ -746,7 +738,7 @@ def list( credentials set in aiplatform.init. Returns: - List[AiPlatformResourceNoun] - A list of TrainingJob resource objects + List[VertexAiResourceNoun] - A list of TrainingJob resource objects """ training_job_subclass_filter = ( @@ -780,449 +772,6 @@ def cancel(self) -> None: self.api_client.cancel_training_pipeline(name=self.resource_name) -def _timestamped_gcs_dir(root_gcs_path: str, dir_name_prefix: str) -> str: - """Composes a timestamped GCS directory. - - Args: - root_gcs_path: GCS path to put the timestamped directory. - dir_name_prefix: Prefix to add the timestamped directory. - Returns: - Timestamped gcs directory path in root_gcs_path. - """ - timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") - dir_name = "-".join([dir_name_prefix, timestamp]) - if root_gcs_path.endswith("/"): - root_gcs_path = root_gcs_path[:-1] - gcs_path = "/".join([root_gcs_path, dir_name]) - if not gcs_path.startswith("gs://"): - return "gs://" + gcs_path - return gcs_path - - -def _timestamped_copy_to_gcs( - local_file_path: str, - gcs_dir: str, - project: Optional[str] = None, - credentials: Optional[auth_credentials.Credentials] = None, -) -> str: - """Copies a local file to a GCS path. - - The file copied to GCS is the name of the local file prepended with an - "aiplatform-{timestamp}-" string. - - Args: - local_file_path (str): Required. Local file to copy to GCS. - gcs_dir (str): - Required. The GCS directory to copy to. - project (str): - Project that contains the staging bucket. Default will be used if not - provided. Model Builder callers should pass this in. - credentials (auth_credentials.Credentials): - Custom credentials to use with bucket. Model Builder callers should pass - this in. - Returns: - gcs_path (str): The path of the copied file in gcs. 
- """ - - gcs_bucket, gcs_blob_prefix = utils.extract_bucket_and_prefix_from_gcs_path(gcs_dir) - - local_file_name = pathlib.Path(local_file_path).name - timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") - blob_path = "-".join(["aiplatform", timestamp, local_file_name]) - - if gcs_blob_prefix: - blob_path = "/".join([gcs_blob_prefix, blob_path]) - - # TODO(b/171202993) add user agent - client = storage.Client(project=project, credentials=credentials) - bucket = client.bucket(gcs_bucket) - blob = bucket.blob(blob_path) - blob.upload_from_filename(local_file_path) - - gcs_path = "".join(["gs://", "/".join([blob.bucket.name, blob.name])]) - return gcs_path - - -def _get_python_executable() -> str: - """Returns Python executable. - - Returns: - Python executable to use for setuptools packaging. - Raises: - EnvironmentError: If Python executable is not found. - """ - - python_executable = sys.executable - - if not python_executable: - raise EnvironmentError("Cannot find Python executable for packaging.") - return python_executable - - -class _TrainingScriptPythonPackager: - """Converts a Python script into Python package suitable for aiplatform - training. - - Copies the script to specified location. - - Class Attributes: - _TRAINER_FOLDER: Constant folder name to build package. - _ROOT_MODULE: Constant root name of module. - _TEST_MODULE_NAME: Constant name of module that will store script. - _SETUP_PY_VERSION: Constant version of this created python package. - _SETUP_PY_TEMPLATE: Constant template used to generate setup.py file. - _SETUP_PY_SOURCE_DISTRIBUTION_CMD: - Constant command to generate the source distribution package. - - Attributes: - script_path: local path of script to package - requirements: list of Python dependencies to add to package - - Usage: - - packager = TrainingScriptPythonPackager('my_script.py', ['pandas', 'pytorch']) - gcs_path = packager.package_and_copy_to_gcs( - gcs_staging_dir='my-bucket', - project='my-prject') - module_name = packager.module_name - - The package after installed can be executed as: - python -m aiplatform_custom_trainer_script.task - """ - - _TRAINER_FOLDER = "trainer" - _ROOT_MODULE = "aiplatform_custom_trainer_script" - _TASK_MODULE_NAME = "task" - _SETUP_PY_VERSION = "0.1" - - _SETUP_PY_TEMPLATE = """from setuptools import find_packages -from setuptools import setup - -setup( - name='{name}', - version='{version}', - packages=find_packages(), - install_requires=({requirements}), - include_package_data=True, - description='My training application.' -)""" - - _SETUP_PY_SOURCE_DISTRIBUTION_CMD = "setup.py sdist --formats=gztar" - - # Module name that can be executed during training. ie. python -m - module_name = f"{_ROOT_MODULE}.{_TASK_MODULE_NAME}" - - def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = None): - """Initializes packager. - - Args: - script_path (str): Required. Local path to script. - requirements (Sequence[str]): - List of python packages dependencies of script. - """ - - self.script_path = script_path - self.requirements = requirements or [] - - def make_package(self, package_directory: str) -> str: - """Converts script into a Python package suitable for python module - execution. - - Args: - package_directory (str): Directory to build package in. - Returns: - source_distribution_path (str): Path to built package. - Raises: - RunTimeError: If package creation fails. 
- """ - # The root folder to builder the package in - package_path = pathlib.Path(package_directory) - - # Root directory of the package - trainer_root_path = package_path / self._TRAINER_FOLDER - - # The root module of the python package - trainer_path = trainer_root_path / self._ROOT_MODULE - - # __init__.py path in root module - init_path = trainer_path / "__init__.py" - - # The module that will contain the script - script_out_path = trainer_path / f"{self._TASK_MODULE_NAME}.py" - - # The path to setup.py in the package. - setup_py_path = trainer_root_path / "setup.py" - - # The path to the generated source distribution. - source_distribution_path = ( - trainer_root_path - / "dist" - / f"{self._ROOT_MODULE}-{self._SETUP_PY_VERSION}.tar.gz" - ) - - trainer_root_path.mkdir() - trainer_path.mkdir() - - # Make empty __init__.py - with init_path.open("w"): - pass - - # Format the setup.py file. - setup_py_output = self._SETUP_PY_TEMPLATE.format( - name=self._ROOT_MODULE, - requirements=",".join(f'"{r}"' for r in self.requirements), - version=self._SETUP_PY_VERSION, - ) - - # Write setup.py - with setup_py_path.open("w") as fp: - fp.write(setup_py_output) - - # Copy script as module of python package. - shutil.copy(self.script_path, script_out_path) - - # Run setup.py to create the source distribution. - setup_cmd = [ - _get_python_executable() - ] + self._SETUP_PY_SOURCE_DISTRIBUTION_CMD.split() - - p = subprocess.Popen( - args=setup_cmd, - cwd=trainer_root_path, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - output, error = p.communicate() - - # Raise informative error if packaging fails. - if p.returncode != 0: - raise RuntimeError( - "Packaging of training script failed with code %d\n%s \n%s" - % (p.returncode, output.decode(), error.decode()) - ) - - return str(source_distribution_path) - - def package_and_copy(self, copy_method: Callable[[str], str]) -> str: - """Packages the script and executes copy with given copy_method. - - Args: - copy_method Callable[[str], str] - Takes a string path, copies to a desired location, and returns the - output path location. - Returns: - output_path str: Location of copied package. - """ - - with tempfile.TemporaryDirectory() as tmpdirname: - source_distribution_path = self.make_package(tmpdirname) - output_location = copy_method(source_distribution_path) - _LOGGER.info("Training script copied to:\n%s." % output_location) - return output_location - - def package_and_copy_to_gcs( - self, - gcs_staging_dir: str, - project: str = None, - credentials: Optional[auth_credentials.Credentials] = None, - ) -> str: - """Packages script in Python package and copies package to GCS bucket. - - Args - gcs_staging_dir (str): Required. GCS Staging directory. - project (str): Required. Project where GCS Staging bucket is located. - credentials (auth_credentials.Credentials): - Optional credentials used with GCS client. - Returns: - GCS location of Python package. - """ - - copy_method = functools.partial( - _timestamped_copy_to_gcs, - gcs_dir=gcs_staging_dir, - project=project, - credentials=credentials, - ) - return self.package_and_copy(copy_method=copy_method) - - -class _MachineSpec(NamedTuple): - """Specification container for Machine specs used for distributed training. - - Usage: - - spec = _MachineSpec( - replica_count=10, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80') - - Note that container and python package specs are not stored with this spec. 
- """ - - replica_count: int = 0 - machine_type: str = "n1-standard-4" - accelerator_count: int = 0 - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED" - - def _get_accelerator_type(self) -> Optional[str]: - """Validates accelerator_type and returns the name of the accelerator. - - Returns: - None if no accelerator or valid accelerator name. - - Raise: - ValueError if accelerator type is invalid. - """ - - # Raises ValueError if invalid accelerator_type - utils.validate_accelerator_type(self.accelerator_type) - - accelerator_enum = getattr( - gca_accelerator_type.AcceleratorType, self.accelerator_type - ) - - if ( - accelerator_enum - != gca_accelerator_type.AcceleratorType.ACCELERATOR_TYPE_UNSPECIFIED - ): - return self.accelerator_type - - @property - def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]: - """Return specification as a Dict.""" - spec = { - "machineSpec": {"machineType": self.machine_type}, - "replicaCount": self.replica_count, - } - accelerator_type = self._get_accelerator_type() - if accelerator_type and self.accelerator_count: - spec["machineSpec"]["acceleratorType"] = accelerator_type - spec["machineSpec"]["acceleratorCount"] = self.accelerator_count - - return spec - - @property - def is_empty(self) -> bool: - """Returns True is replica_count > 0 False otherwise.""" - return self.replica_count <= 0 - - -class _DistributedTrainingSpec(NamedTuple): - """Configuration for distributed training worker pool specs. - - AI Platform Training expects configuration in this order: - [ - chief spec, # can only have one replica - worker spec, - parameter server spec, - evaluator spec - ] - - Usage: - - dist_training_spec = _DistributedTrainingSpec( - chief_spec = _MachineSpec( - replica_count=1, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ), - worker_spec = _MachineSpec( - replica_count=10, - machine_type='n1-standard-4', - accelerator_count=2, - accelerator_type='NVIDIA_TESLA_K80' - ) - ) - """ - - chief_spec: _MachineSpec = _MachineSpec() - worker_spec: _MachineSpec = _MachineSpec() - parameter_server_spec: _MachineSpec = _MachineSpec() - evaluator_spec: _MachineSpec = _MachineSpec() - - @property - def pool_specs( - self, - ) -> List[Dict[str, Union[int, str, Dict[str, Union[int, str]]]]]: - """Return each pools spec in correct order for AI Platform as a list of - dicts. - - Also removes specs if they are empty but leaves specs in if there unusual - specifications to not break the ordering in AI Platform Training. - ie. 0 chief replica, 10 worker replica, 3 ps replica - - Returns: - Order list of worker pool specs suitable for AI Platform Training. - """ - if self.chief_spec.replica_count > 1: - raise ValueError("Chief spec replica count cannot be greater than 1.") - - spec_order = [ - self.chief_spec, - self.worker_spec, - self.parameter_server_spec, - self.evaluator_spec, - ] - specs = [s.spec_dict for s in spec_order] - for i in reversed(range(len(spec_order))): - if spec_order[i].is_empty: - specs.pop() - else: - break - return specs - - @classmethod - def chief_worker_pool( - cls, - replica_count: int = 0, - machine_type: str = "n1-standard-4", - accelerator_count: int = 0, - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", - ) -> "_DistributedTrainingSpec": - """Parameterizes Config to support only chief with worker replicas. - - For replica is assigned to chief and the remainder to workers. All spec have the - same machine type, accelerator count, and accelerator type. 
- - Args: - replica_count (int): - The number of worker replicas. Assigns 1 chief replica and - replica_count - 1 worker replicas. - machine_type (str): - The type of machine to use for training. - accelerator_type (str): - Hardware accelerator type. One of ACCELERATOR_TYPE_UNSPECIFIED, - NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, - NVIDIA_TESLA_T4 - accelerator_count (int): - The number of accelerators to attach to a worker replica. - - Returns: - _DistributedTrainingSpec representing one chief and n workers all of same - type. If replica_count <= 0 then an empty spec is returned. - """ - if replica_count <= 0: - return cls() - - chief_spec = _MachineSpec( - replica_count=1, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ) - - worker_spec = _MachineSpec( - replica_count=replica_count - 1, - machine_type=machine_type, - accelerator_count=accelerator_count, - accelerator_type=accelerator_type, - ) - - return cls(chief_spec=chief_spec, worker_spec=worker_spec) - - class _CustomTrainingJob(_TrainingJob): """ABC for Custom Training Pipelines..""" @@ -1257,15 +806,15 @@ def __init__( container_uri (str): Required: Uri of the training container image in the GCR. model_serving_container_image_uri (str): - If the training produces a managed AI Platform Model, the URI of the + If the training produces a managed Vertex AI Model, the URI of the Model serving container suitable for serving the model produced by the training script. model_serving_container_predict_route (str): - If the training produces a managed AI Platform Model, An HTTP path to + If the training produces a managed Vertex AI Model, An HTTP path to send prediction requests to the container, and which must be supported - by it. If not specified a default HTTP path will be used by AI Platform. + by it. If not specified a default HTTP path will be used by Vertex AI. model_serving_container_health_route (str): - If the training produces a managed AI Platform Model, an HTTP path to + If the training produces a managed Vertex AI Model, an HTTP path to send health check requests to the container, and which must be supported by it. If not specified a standard HTTP path will be used by AI Platform. @@ -1290,7 +839,7 @@ def __init__( and values are environment variable values for those names. model_serving_container_ports (Sequence[int]): Declaration of ports that are exposed by the container. This field is - primarily informational, it gives AI Platform information about the + primarily informational, it gives Vertex AI information about the network connections the container uses. Listing or not a port here has no impact on whether the port is actually exposed, any port listening on the default "0.0.0.0" address inside a container will be accessible from @@ -1455,13 +1004,13 @@ def _prepare_and_validate_run( machine_type: str = "n1-standard-4", accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", accelerator_count: int = 0, - ) -> Tuple[_DistributedTrainingSpec, Optional[gca_model.Model]]: + ) -> Tuple[worker_spec_utils._DistributedTrainingSpec, Optional[gca_model.Model]]: """Create worker pool specs and managed model as well validating the run. Args: model_display_name (str): - If the script produces a managed AI Platform Model. The display name of + If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. 
@@ -1505,7 +1054,7 @@ def _prepare_and_validate_run( model_display_name = model_display_name or self._display_name + "-model" # validates args and will raise - worker_pool_specs = _DistributedTrainingSpec.chief_worker_pool( + worker_pool_specs = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=replica_count, machine_type=machine_type, accelerator_count=accelerator_count, @@ -1523,7 +1072,7 @@ def _prepare_and_validate_run( def _prepare_training_task_inputs_and_output_dir( self, - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, base_output_dir: Optional[str] = None, service_account: Optional[str] = None, network: Optional[str] = None, @@ -1531,7 +1080,7 @@ def _prepare_training_task_inputs_and_output_dir( """Prepares training task inputs and output directory for custom job. Args: - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. base_output_dir (str): GCS output directory of job. If not provided a @@ -1556,12 +1105,12 @@ def _prepare_training_task_inputs_and_output_dir( _LOGGER.info("Training Output directory:\n%s " % base_output_dir) training_task_inputs = { - "workerPoolSpecs": worker_pool_specs, - "baseOutputDirectory": {"output_uri_prefix": base_output_dir}, + "worker_pool_specs": worker_pool_specs, + "base_output_directory": {"output_uri_prefix": base_output_dir}, } if service_account: - training_task_inputs["serviceAccount"] = service_account + training_task_inputs["service_account"] = service_account if network: training_task_inputs["network"] = network @@ -1581,10 +1130,10 @@ def _model_upload_fail_string(self) -> str: # TODO(b/172368325) add scheduling, custom_job.Scheduling class CustomTrainingJob(_CustomTrainingJob): - """Class to launch a Custom Training Job in AI Platform using a script. + """Class to launch a Custom Training Job in Vertex AI using a script. Takes a training implementation as a python script and executes that - script in Cloud AI Platform Training. + script in Cloud Vertex AI Training. """ def __init__( @@ -1635,7 +1184,7 @@ def __init__( TODO(b/169782082) add documentation about traning utilities - To ensure your model gets saved in AI Platform, write your saved model to + To ensure your model gets saved in Vertex AI, write your saved model to os.environ["AIP_MODEL_DIR"] in your provided training script. @@ -1648,15 +1197,15 @@ def __init__( requirements (Sequence[str]): List of python packages dependencies of script. model_serving_container_image_uri (str): - If the training produces a managed AI Platform Model, the URI of the + If the training produces a managed Vertex AI Model, the URI of the Model serving container suitable for serving the model produced by the training script. model_serving_container_predict_route (str): - If the training produces a managed AI Platform Model, An HTTP path to + If the training produces a managed Vertex AI Model, An HTTP path to send prediction requests to the container, and which must be supported - by it. If not specified a default HTTP path will be used by AI Platform. + by it. If not specified a default HTTP path will be used by Vertex AI. model_serving_container_health_route (str): - If the training produces a managed AI Platform Model, an HTTP path to + If the training produces a managed Vertex AI Model, an HTTP path to send health check requests to the container, and which must be supported by it. 
If not specified a standard HTTP path will be used by AI Platform. @@ -1681,7 +1230,7 @@ def __init__( and values are environment variable values for those names. model_serving_container_ports (Sequence[int]): Declaration of ports that are exposed by the container. This field is - primarily informational, it gives AI Platform information about the + primarily informational, it gives Vertex AI information about the network connections the container uses. Listing or not a port here has no impact on whether the port is actually exposed, any port listening on the default "0.0.0.0" address inside a container will be accessible from @@ -1837,7 +1386,7 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform.If none of the fractions are set, by default roughly 80% + decided by Vertex AI.If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -1849,7 +1398,7 @@ def run( datasets.VideoDataset, ] ): - AI Platform to fit this training against. Custom training script should + Vertex AI to fit this training against. Custom training script should retrieve datasets through passed in environment variables uris: os.environ["AIP_TRAINING_DATA_URI"] @@ -1882,7 +1431,7 @@ def run( and ``annotation_schema_uri``. model_display_name (str): - If the script produces a managed AI Platform Model. The display name of + If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -1891,7 +1440,7 @@ def run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -1969,8 +1518,8 @@ def run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. 
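+
+        A minimal, illustrative sketch of a full run (the serving container URI
+        and dataset are placeholders for values you would supply):
+
+        ```
+        job = aiplatform.CustomTrainingJob(
+            display_name='my-training-job',
+            script_path='training_script.py',
+            container_uri='gcr.io/cloud-aiplatform/training/tf-cpu.2-2:latest',
+            requirements=['gcsfs==0.7.1'],
+            model_serving_container_image_uri='gcr.io/my-project/my-serving-image',
+        )
+
+        model = job.run(
+            dataset=my_tabular_dataset,  # assumed to already exist
+            replica_count=1,
+            model_display_name='my-model',
+        )
+        ```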
""" worker_pool_specs, managed_model = self._prepare_and_validate_run( model_display_name=model_display_name, @@ -1981,7 +1530,7 @@ def run( ) # make and copy package - python_packager = _TrainingScriptPythonPackager( + python_packager = source_utils._TrainingScriptPythonPackager( script_path=self._script_path, requirements=self._requirements ) @@ -2007,7 +1556,7 @@ def run( @base.optional_sync(construct_object_on_arg="managed_model") def _run( self, - python_packager: _TrainingScriptPythonPackager, + python_packager: source_utils._TrainingScriptPythonPackager, dataset: Optional[ Union[ datasets.ImageDataset, @@ -2017,7 +1566,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -2034,7 +1583,7 @@ def _run( """Packages local script and launches training_job. Args: - python_packager (_TrainingScriptPythonPackager): + python_packager (source_utils._TrainingScriptPythonPackager): Required. Python Packager pointing to training script locally. dataset ( Union[ @@ -2044,11 +1593,11 @@ def _run( datasets.VideoDataset, ] ): - AI Platform to fit this training against. + Vertex AI to fit this training against. annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -2068,7 +1617,7 @@ def _run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -2122,8 +1671,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. 
""" package_gcs_uri = python_packager.package_and_copy_to_gcs( gcs_staging_dir=self._staging_bucket, @@ -2132,17 +1681,17 @@ def _run( ) for spec in worker_pool_specs: - spec["pythonPackageSpec"] = { - "executorImageUri": self._container_uri, - "pythonModule": python_packager.module_name, - "packageUris": [package_gcs_uri], + spec["python_package_spec"] = { + "executor_image_uri": self._container_uri, + "python_module": python_packager.module_name, + "package_uris": [package_gcs_uri], } if args: - spec["pythonPackageSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["pythonPackageSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] @@ -2175,7 +1724,7 @@ def _run( class CustomContainerTrainingJob(_CustomTrainingJob): - """Class to launch a Custom Training Job in AI Platform using a + """Class to launch a Custom Training Job in Vertex AI using a Container.""" def __init__( @@ -2224,7 +1773,7 @@ def __init__( TODO(b/169782082) add documentation about traning utilities - To ensure your model gets saved in AI Platform, write your saved model to + To ensure your model gets saved in Vertex AI, write your saved model to os.environ["AIP_MODEL_DIR"] in your provided training script. @@ -2237,15 +1786,15 @@ def __init__( The command to be invoked when the container is started. It overrides the entrypoint instruction in Dockerfile when provided model_serving_container_image_uri (str): - If the training produces a managed AI Platform Model, the URI of the + If the training produces a managed Vertex AI Model, the URI of the Model serving container suitable for serving the model produced by the training script. model_serving_container_predict_route (str): - If the training produces a managed AI Platform Model, An HTTP path to + If the training produces a managed Vertex AI Model, An HTTP path to send prediction requests to the container, and which must be supported - by it. If not specified a default HTTP path will be used by AI Platform. + by it. If not specified a default HTTP path will be used by Vertex AI. model_serving_container_health_route (str): - If the training produces a managed AI Platform Model, an HTTP path to + If the training produces a managed Vertex AI Model, an HTTP path to send health check requests to the container, and which must be supported by it. If not specified a standard HTTP path will be used by AI Platform. @@ -2270,7 +1819,7 @@ def __init__( and values are environment variable values for those names. model_serving_container_ports (Sequence[int]): Declaration of ports that are exposed by the container. This field is - primarily informational, it gives AI Platform information about the + primarily informational, it gives Vertex AI information about the network connections the container uses. Listing or not a port here has no impact on whether the port is actually exposed, any port listening on the default "0.0.0.0" address inside a container will be accessible from @@ -2425,12 +1974,12 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. 
If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: dataset (Union[datasets.ImageDataset,datasets.TabularDataset,datasets.TextDataset,datasets.VideoDataset]): - AI Platform to fit this training against. Custom training script should + Vertex AI to fit this training against. Custom training script should retrieve datasets through passed in environment variables uris: os.environ["AIP_TRAINING_DATA_URI"] @@ -2463,7 +2012,7 @@ def run( and ``annotation_schema_uri``. model_display_name (str): - If the script produces a managed AI Platform Model. The display name of + If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -2472,7 +2021,7 @@ def run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -2550,8 +2099,8 @@ def run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. Raises: RuntimeError: If Training job has already been run, staging_bucket has not @@ -2596,7 +2145,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -2620,11 +2169,11 @@ def _run( datasets.VideoDataset, ] ): - AI Platform to fit this training against. + Vertex AI to fit this training against. annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -2644,7 +2193,7 @@ def _run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -2697,8 +2246,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. 
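+
+        For reference, a minimal sketch of the public `run()` that delegates
+        here (the image URI and command are placeholders):
+
+        ```
+        job = aiplatform.CustomContainerTrainingJob(
+            display_name='my-container-job',
+            container_uri='gcr.io/my-project/my-training-image:latest',
+            command=['python3', '-m', 'trainer.task'],
+        )
+
+        model = job.run(replica_count=1, model_display_name='my-model')
+        ```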
""" for spec in worker_pool_specs: @@ -2891,7 +2440,7 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -2942,7 +2491,7 @@ def run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. If the script produces a managed AI Platform Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -2958,8 +2507,8 @@ def run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. Raises: RuntimeError: If Training job has already been run or is waiting to run. @@ -3006,7 +2555,7 @@ def _run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -3057,7 +2606,7 @@ def _run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. If the script produces a managed AI Platform Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -3074,8 +2623,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. """ training_task_definition = schema.training_job.definition.automl_tabular @@ -3334,7 +2883,7 @@ def run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. If the script produces a managed AI Platform Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -3344,8 +2893,8 @@ def run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. Raises: RuntimeError if Training job has already been run or is waiting to run. 
@@ -3519,7 +3068,7 @@ def _run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. If the script produces a managed AI Platform Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -3529,8 +3078,8 @@ def _run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. """ training_task_definition = schema.training_job.definition.automl_forecasting @@ -3762,7 +3311,7 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -3796,7 +3345,7 @@ def run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. The display name of the managed AI Platform Model. The name + Optional. The display name of the managed Vertex AI Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. If not provided upon creation, the job's display_name is used. disable_early_stopping: bool = False @@ -3810,8 +3359,8 @@ def run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. Raises: RuntimeError: If Training job has already been run or is waiting to run. @@ -3854,7 +3403,7 @@ def _run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -3894,7 +3443,7 @@ def _run( will error. The minimum value is 1000 and the maximum is 72000. model_display_name (str): - Optional. The display name of the managed AI Platform Model. The name + Optional. The display name of the managed Vertex AI Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. If a `base_model` was provided, the display_name in the base_model will be overritten with this value. If not provided upon @@ -3911,8 +3460,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. 
""" # Retrieve the objective-specific training task schema based on prediction_type @@ -3942,7 +3491,7 @@ def _run( model_tbt.description = getattr(base_model._gca_resource, "description") model_tbt.labels = getattr(base_model._gca_resource, "labels") - # Set ID of AI Platform Model to base this training job off of + # Set ID of Vertex AI Model to base this training job off of training_task_inputs_dict["baseModelId"] = base_model.name return self._run_job( @@ -3965,11 +3514,11 @@ def _model_upload_fail_string(self) -> str: class CustomPythonPackageTrainingJob(_CustomTrainingJob): - """Class to launch a Custom Training Job in AI Platform using a Python + """Class to launch a Custom Training Job in Vertex AI using a Python Package. Takes a training implementation as a python package and executes - that package in Cloud AI Platform Training. + that package in Cloud Vertex AI Training. """ def __init__( @@ -4027,7 +3576,7 @@ def __init__( model_display_name='my-trained-model' ) - To ensure your model gets saved in AI Platform, write your saved model to + To ensure your model gets saved in Vertex AI, write your saved model to os.environ["AIP_MODEL_DIR"] in your provided training script. Args: @@ -4040,15 +3589,15 @@ def __init__( container_uri (str): Required: Uri of the training container image in the GCR. model_serving_container_image_uri (str): - If the training produces a managed AI Platform Model, the URI of the + If the training produces a managed Vertex AI Model, the URI of the Model serving container suitable for serving the model produced by the training script. model_serving_container_predict_route (str): - If the training produces a managed AI Platform Model, An HTTP path to + If the training produces a managed Vertex AI Model, An HTTP path to send prediction requests to the container, and which must be supported - by it. If not specified a default HTTP path will be used by AI Platform. + by it. If not specified a default HTTP path will be used by Vertex AI. model_serving_container_health_route (str): - If the training produces a managed AI Platform Model, an HTTP path to + If the training produces a managed Vertex AI Model, an HTTP path to send health check requests to the container, and which must be supported by it. If not specified a standard HTTP path will be used by AI Platform. @@ -4073,7 +3622,7 @@ def __init__( and values are environment variable values for those names. model_serving_container_ports (Sequence[int]): Declaration of ports that are exposed by the container. This field is - primarily informational, it gives AI Platform information about the + primarily informational, it gives Vertex AI information about the network connections the container uses. Listing or not a port here has no impact on whether the port is actually exposed, any port listening on the default "0.0.0.0" address inside a container will be accessible from @@ -4227,12 +3776,12 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform.If none of the fractions are set, by default roughly 80% + decided by Vertex AI.If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: dataset (Union[datasets.ImageDataset,datasets.TabularDataset,datasets.TextDataset,datasets.VideoDataset,]): - AI Platform to fit this training against. 
Custom training script should + Vertex AI to fit this training against. Custom training script should retrieve datasets through passed in environment variables uris: os.environ["AIP_TRAINING_DATA_URI"] @@ -4265,7 +3814,7 @@ def run( and ``annotation_schema_uri``. model_display_name (str): - If the script produces a managed AI Platform Model. The display name of + If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -4274,7 +3823,7 @@ def run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -4352,8 +3901,8 @@ def run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. """ worker_pool_specs, managed_model = self._prepare_and_validate_run( model_display_name=model_display_name, @@ -4393,7 +3942,7 @@ def _run( ] ], annotation_schema_uri: Optional[str], - worker_pool_specs: _DistributedTrainingSpec, + worker_pool_specs: worker_spec_utils._DistributedTrainingSpec, managed_model: Optional[gca_model.Model] = None, args: Optional[List[Union[str, float, int]]] = None, environment_variables: Optional[Dict[str, str]] = None, @@ -4418,11 +3967,11 @@ def _run( datasets.VideoDataset, ] ): - AI Platform to fit this training against. + Vertex AI to fit this training against. annotation_schema_uri (str): Google Cloud Storage URI points to a YAML file describing annotation schema. - worker_pools_spec (_DistributedTrainingSpec): + worker_pools_spec (worker_spec_utils._DistributedTrainingSpec): Worker pools pecs required to run job. managed_model (gca_model.Model): Model proto if this script produces a Managed Model. @@ -4442,7 +3991,7 @@ def _run( GCS output directory of job. If not provided a timestamped directory in the staging directory will be used. - AI Platform sets the following environment variables when it runs your training code: + Vertex AI sets the following environment variables when it runs your training code: - AIP_MODEL_DIR: a Cloud Storage URI of a directory intended for saving model artifacts, i.e. /model/ - AIP_CHECKPOINT_DIR: a Cloud Storage URI of a directory intended for saving checkpoints, i.e. /checkpoints/ @@ -4481,21 +4030,21 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. 
""" for spec in worker_pool_specs: - spec["pythonPackageSpec"] = { - "executorImageUri": self._container_uri, - "pythonModule": self._python_module, - "packageUris": [self._package_gcs_uri], + spec["python_package_spec"] = { + "executor_image_uri": self._container_uri, + "python_module": self._python_module, + "package_uris": [self._package_gcs_uri], } if args: - spec["pythonPackageSpec"]["args"] = args + spec["python_package_spec"]["args"] = args if environment_variables: - spec["pythonPackageSpec"]["env"] = [ + spec["python_package_spec"]["env"] = [ {"name": key, "value": value} for key, value in environment_variables.items() ] @@ -4679,7 +4228,7 @@ def run( Required. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. model_display_name (str): - Optional. The display name of the managed AI Platform Model. The name + Optional. The display name of the managed Vertex AI Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. If not provided upon creation, the job's display_name is used. sync: bool = True @@ -4687,8 +4236,8 @@ def run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. Raises: RuntimeError: If Training job has already been run or is waiting to run. @@ -4740,7 +4289,7 @@ def _run( Required. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. model_display_name (str): - Optional. The display name of the managed AI Platform Model. The name + Optional. The display name of the managed Vertex AI Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. If a `base_model` was provided, the display_name in the base_model will be overritten with this value. If not provided upon @@ -4751,8 +4300,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. """ # Retrieve the objective-specific training task schema based on prediction_type @@ -4815,7 +4364,7 @@ def __init__( The type of prediction the Model is to produce, one of: "classification" - A classification model analyzes text data and returns a list of categories that apply to the text found in the data. - AI Platform offers both single-label and multi-label text classification models. + Vertex AI offers both single-label and multi-label text classification models. "extraction" - An entity extraction model inspects text data for known entities referenced in the data and labels those entities in the text. @@ -4929,7 +4478,7 @@ def run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. 
Args: @@ -4949,7 +4498,7 @@ def run( Required. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. model_display_name (str): - Optional. The display name of the managed AI Platform Model. + Optional. The display name of the managed Vertex AI Model. The name can be up to 128 characters long and can consist of any UTF-8 characters. @@ -4959,7 +4508,7 @@ def run( will be executed in concurrent Future and any downstream object will be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource. + model: The trained Vertex AI Model resource. Raises: RuntimeError: If Training job has already been run or is waiting to run. @@ -4996,7 +4545,7 @@ def _run( Any of ``training_fraction_split``, ``validation_fraction_split`` and ``test_fraction_split`` may optionally be provided, they must sum to up to 1. If the provided ones sum to less than 1, the remainder is assigned to sets as - decided by AI Platform. If none of the fractions are set, by default roughly 80% + decided by Vertex AI. If none of the fractions are set, by default roughly 80% of data will be used for training, 10% for validation, and 10% for test. Args: @@ -5018,7 +4567,7 @@ def _run( Required. The fraction of the input data that is to be used to evaluate the Model. This is ignored if Dataset is not provided. model_display_name (str): - Optional. If the script produces a managed AI Platform Model. The display name of + Optional. If the script produces a managed Vertex AI Model. The display name of the Model. The name can be up to 128 characters long and can be consist of any UTF-8 characters. @@ -5029,8 +4578,8 @@ def _run( be immediately returned and synced when the Future has completed. Returns: - model: The trained AI Platform Model resource or None if training did not - produce an AI Platform Model. + model: The trained Vertex AI Model resource or None if training did not + produce an Vertex AI Model. """ if model_display_name is None: diff --git a/google/cloud/aiplatform/training_utils.py b/google/cloud/aiplatform/training_utils.py deleted file mode 100644 index fea60c5005..0000000000 --- a/google/cloud/aiplatform/training_utils.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import json -import os - -from typing import Dict, Optional - - -class EnvironmentVariables: - """Passes on OS' environment variables.""" - - @property - def training_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for training data. None if - environment variable not set. - """ - return os.environ.get("AIP_TRAINING_DATA_URI") - - @property - def validation_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for validation data. None - if environment variable not set. 
- """ - return os.environ.get("AIP_VALIDATION_DATA_URI") - - @property - def test_data_uri(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for test data. None if - environment variable not set. - """ - return os.environ.get("AIP_TEST_DATA_URI") - - @property - def model_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving model artefacts. - None if environment variable not set. - """ - return os.environ.get("AIP_MODEL_DIR") - - @property - def checkpoint_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving checkpoints. - None if environment variable not set. - """ - return os.environ.get("AIP_CHECKPOINT_DIR") - - @property - def tensorboard_log_dir(self) -> Optional[str]: - """ - Returns: - Cloud Storage URI of a directory intended for saving TensorBoard logs. - None if environment variable not set. - """ - return os.environ.get("AIP_TENSORBOARD_LOG_DIR") - - @property - def cluster_spec(self) -> Optional[Dict]: - """ - Returns: - json string as described in https://cloud.google.com/ai-platform-unified/docs/training/distributed-training#cluster-variables - None if environment variable not set. - """ - cluster_spec_env = os.environ.get("CLUSTER_SPEC") - if cluster_spec_env is not None: - return json.loads(cluster_spec_env) - else: - return None - - @property - def tf_config(self) -> Optional[Dict]: - """ - Returns: - json string as described in https://cloud.google.com/ai-platform-unified/docs/training/distributed-training#tf-config - None if environment variable not set. - """ - tf_config_env = os.environ.get("TF_CONFIG") - if tf_config_env is not None: - return json.loads(tf_config_env) - else: - return None diff --git a/google/cloud/aiplatform/utils.py b/google/cloud/aiplatform/utils/__init__.py similarity index 84% rename from google/cloud/aiplatform/utils.py rename to google/cloud/aiplatform/utils/__init__.py index ff86fc1cb8..4404defb21 100644 --- a/google/cloud/aiplatform/utils.py +++ b/google/cloud/aiplatform/utils/__init__.py @@ -17,6 +17,8 @@ import abc +import datetime +import pathlib from collections import namedtuple import logging import re @@ -25,6 +27,8 @@ from google.api_core import client_options from google.api_core import gapic_v1 from google.auth import credentials as auth_credentials +from google.cloud import storage + from google.cloud.aiplatform import compat from google.cloud.aiplatform import constants from google.cloud.aiplatform import initializer @@ -52,8 +56,8 @@ accelerator_type as gca_accelerator_type, ) -AiPlatformServiceClient = TypeVar( - "AiPlatformServiceClient", +VertexAiServiceClient = TypeVar( + "VertexAiServiceClient", # v1beta1 dataset_service_client_v1beta1.DatasetServiceClient, endpoint_service_client_v1beta1.EndpointServiceClient, @@ -106,7 +110,7 @@ def extract_fields_from_resource_name( Args: resource_name (str): - Required. A fully-qualified AI Platform (Unified) resource name + Required. A fully-qualified Vertex AI resource name resource_noun (str): A resource noun to validate the resource name against. @@ -141,7 +145,7 @@ def full_resource_name( Args: resource_name (str): - Required. A fully-qualified AI Platform (Unified) resource name or + Required. A fully-qualified Vertex AI resource name or resource ID. resource_noun (str): A resource noun to validate the resource name against. @@ -159,7 +163,7 @@ def full_resource_name( Returns: resource_name (str): - A fully-qualified AI Platform (Unified) resource name. 
+ A fully-qualified Vertex AI resource name. Raises: ValueError: @@ -253,7 +257,7 @@ def validate_region(region: str) -> bool: region = region.lower() if region not in constants.SUPPORTED_REGIONS: raise ValueError( - f"Unsupported region for AI Platform, select from {constants.SUPPORTED_REGIONS}" + f"Unsupported region for Vertex AI, select from {constants.SUPPORTED_REGIONS}" ) return True @@ -320,7 +324,7 @@ class WrappedClient: def __init__( self, - client_class: Type[AiPlatformServiceClient], + client_class: Type[VertexAiServiceClient], client_options: client_options.ClientOptions, client_info: gapic_v1.client_info.ClientInfo, credentials: Optional[auth_credentials.Credentials] = None, @@ -328,7 +332,7 @@ def __init__( """Stores parameters needed to instantiate client. Args: - client_class (AiPlatformServiceClient): + client_class (VertexAiServiceClient): Required. Class of the client to use. client_options (client_options.ClientOptions): Required. Client options to pass to client. @@ -406,7 +410,7 @@ def __getattr__(self, name: str) -> Any: """Instantiates client and returns attribute of the client.""" return getattr(self._clients[self._default_version], name) - def select_version(self, version: str) -> AiPlatformServiceClient: + def select_version(self, version: str) -> VertexAiServiceClient: return self._clients[version] @@ -480,8 +484,8 @@ class TensorboardClientWithOverride(ClientWithOverride): ) -AiPlatformServiceClientWithOverride = TypeVar( - "AiPlatformServiceClientWithOverride", +VertexAiServiceClientWithOverride = TypeVar( + "VertexAiServiceClientWithOverride", DatasetClientWithOverride, EndpointClientWithOverride, JobClientWithOverride, @@ -499,3 +503,66 @@ def __init__(self, warning_level: int): def filter(self, record): return record.levelname == self._warning_level + + +def _timestamped_gcs_dir(root_gcs_path: str, dir_name_prefix: str) -> str: + """Composes a timestamped GCS directory. + + Args: + root_gcs_path: GCS path to put the timestamped directory. + dir_name_prefix: Prefix to add the timestamped directory. + Returns: + Timestamped gcs directory path in root_gcs_path. + """ + timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") + dir_name = "-".join([dir_name_prefix, timestamp]) + if root_gcs_path.endswith("/"): + root_gcs_path = root_gcs_path[:-1] + gcs_path = "/".join([root_gcs_path, dir_name]) + if not gcs_path.startswith("gs://"): + return "gs://" + gcs_path + return gcs_path + + +def _timestamped_copy_to_gcs( + local_file_path: str, + gcs_dir: str, + project: Optional[str] = None, + credentials: Optional[auth_credentials.Credentials] = None, +) -> str: + """Copies a local file to a GCS path. + + The file copied to GCS is the name of the local file prepended with an + "aiplatform-{timestamp}-" string. + + Args: + local_file_path (str): Required. Local file to copy to GCS. + gcs_dir (str): + Required. The GCS directory to copy to. + project (str): + Project that contains the staging bucket. Default will be used if not + provided. Model Builder callers should pass this in. + credentials (auth_credentials.Credentials): + Custom credentials to use with bucket. Model Builder callers should pass + this in. + Returns: + gcs_path (str): The path of the copied file in gcs. 
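For example, a call like the following sketch would land the file under the staging prefix (bucket and file names are illustrative; the timestamp component follows ``datetime.isoformat`` with a "-" separator):

.. code-block:: Python

    gcs_path = _timestamped_copy_to_gcs(
        local_file_path="/tmp/my_script.py",
        gcs_dir="gs://my-bucket/staging",
    )
    # e.g. "gs://my-bucket/staging/aiplatform-2021-05-19-12:00:00.000-my_script.py"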
+ """ + + gcs_bucket, gcs_blob_prefix = extract_bucket_and_prefix_from_gcs_path(gcs_dir) + + local_file_name = pathlib.Path(local_file_path).name + timestamp = datetime.datetime.now().isoformat(sep="-", timespec="milliseconds") + blob_path = "-".join(["aiplatform", timestamp, local_file_name]) + + if gcs_blob_prefix: + blob_path = "/".join([gcs_blob_prefix, blob_path]) + + # TODO(b/171202993) add user agent + client = storage.Client(project=project, credentials=credentials) + bucket = client.bucket(gcs_bucket) + blob = bucket.blob(blob_path) + blob.upload_from_filename(local_file_path) + + gcs_path = "".join(["gs://", "/".join([blob.bucket.name, blob.name])]) + return gcs_path diff --git a/google/cloud/aiplatform/utils/source_utils.py b/google/cloud/aiplatform/utils/source_utils.py new file mode 100644 index 0000000000..b7fcef806f --- /dev/null +++ b/google/cloud/aiplatform/utils/source_utils.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +import functools +import pathlib +import shutil +import subprocess +import sys +import tempfile +from typing import Optional, Sequence, Callable + +from google.auth import credentials as auth_credentials +from google.cloud.aiplatform import base +from google.cloud.aiplatform import utils + +_LOGGER = base.Logger(__name__) + + +def _get_python_executable() -> str: + """Returns Python executable. + + Returns: + Python executable to use for setuptools packaging. + Raises: + EnvironmentError: If Python executable is not found. + """ + + python_executable = sys.executable + + if not python_executable: + raise EnvironmentError("Cannot find Python executable for packaging.") + return python_executable + + +class _TrainingScriptPythonPackager: + """Converts a Python script into Python package suitable for aiplatform + training. + + Copies the script to specified location. + + Class Attributes: + _TRAINER_FOLDER: Constant folder name to build package. + _ROOT_MODULE: Constant root name of module. + _TEST_MODULE_NAME: Constant name of module that will store script. + _SETUP_PY_VERSION: Constant version of this created python package. + _SETUP_PY_TEMPLATE: Constant template used to generate setup.py file. + _SETUP_PY_SOURCE_DISTRIBUTION_CMD: + Constant command to generate the source distribution package. 
+
+    Attributes:
+        script_path: local path of script to package
+        requirements: list of Python dependencies to add to package
+
+    Usage:
+
+    packager = _TrainingScriptPythonPackager('my_script.py', ['pandas', 'pytorch'])
+    gcs_path = packager.package_and_copy_to_gcs(
+        gcs_staging_dir='my-bucket',
+        project='my-project')
+    module_name = packager.module_name
+
+    Once installed, the package can be executed as:
+    python -m aiplatform_custom_trainer_script.task
+    """
+
+    _TRAINER_FOLDER = "trainer"
+    _ROOT_MODULE = "aiplatform_custom_trainer_script"
+    _TASK_MODULE_NAME = "task"
+    _SETUP_PY_VERSION = "0.1"
+
+    _SETUP_PY_TEMPLATE = """from setuptools import find_packages
+from setuptools import setup
+
+setup(
+    name='{name}',
+    version='{version}',
+    packages=find_packages(),
+    install_requires=({requirements}),
+    include_package_data=True,
+    description='My training application.'
+)"""
+
+    _SETUP_PY_SOURCE_DISTRIBUTION_CMD = "setup.py sdist --formats=gztar"
+
+    # Module name that can be executed during training, i.e. python -m
+    module_name = f"{_ROOT_MODULE}.{_TASK_MODULE_NAME}"
+
+    def __init__(self, script_path: str, requirements: Optional[Sequence[str]] = None):
+        """Initializes packager.
+
+        Args:
+            script_path (str): Required. Local path to script.
+            requirements (Sequence[str]):
+                List of Python package dependencies of the script.
+        """
+
+        self.script_path = script_path
+        self.requirements = requirements or []
+
+    def make_package(self, package_directory: str) -> str:
+        """Converts the script into a Python package suitable for Python module
+        execution.
+
+        Args:
+            package_directory (str): Directory to build the package in.
+        Returns:
+            source_distribution_path (str): Path to the built package.
+        Raises:
+            RuntimeError: If package creation fails.
+        """
+        # The root folder to build the package in
+        package_path = pathlib.Path(package_directory)
+
+        # Root directory of the package
+        trainer_root_path = package_path / self._TRAINER_FOLDER
+
+        # The root module of the Python package
+        trainer_path = trainer_root_path / self._ROOT_MODULE
+
+        # __init__.py path in root module
+        init_path = trainer_path / "__init__.py"
+
+        # The module that will contain the script
+        script_out_path = trainer_path / f"{self._TASK_MODULE_NAME}.py"
+
+        # The path to setup.py in the package.
+        setup_py_path = trainer_root_path / "setup.py"
+
+        # The path to the generated source distribution.
+        source_distribution_path = (
+            trainer_root_path
+            / "dist"
+            / f"{self._ROOT_MODULE}-{self._SETUP_PY_VERSION}.tar.gz"
+        )
+
+        trainer_root_path.mkdir()
+        trainer_path.mkdir()
+
+        # Make an empty __init__.py
+        with init_path.open("w"):
+            pass
+
+        # Format the setup.py file.
+        setup_py_output = self._SETUP_PY_TEMPLATE.format(
+            name=self._ROOT_MODULE,
+            requirements=",".join(f'"{r}"' for r in self.requirements),
+            version=self._SETUP_PY_VERSION,
+        )
+
+        # Write setup.py
+        with setup_py_path.open("w") as fp:
+            fp.write(setup_py_output)
+
+        # Copy the script in as a module of the Python package.
+        shutil.copy(self.script_path, script_out_path)
+
+        # Run setup.py to create the source distribution.
+        setup_cmd = [
+            _get_python_executable()
+        ] + self._SETUP_PY_SOURCE_DISTRIBUTION_CMD.split()
+
+        p = subprocess.Popen(
+            args=setup_cmd,
+            cwd=trainer_root_path,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        output, error = p.communicate()
+
+        # Raise an informative error if packaging fails.
+        if p.returncode != 0:
+            raise RuntimeError(
+                "Packaging of training script failed with code %d\n%s \n%s"
+                % (p.returncode, output.decode(), error.decode())
+            )
+
+        return str(source_distribution_path)
+
+    def package_and_copy(self, copy_method: Callable[[str], str]) -> str:
+        """Packages the script and executes the copy with the given copy_method.
+
+        Args:
+            copy_method (Callable[[str], str]):
+                Takes a string path, copies to a desired location, and returns the
+                output path location.
+        Returns:
+            output_path (str): Location of the copied package.
+        """
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            source_distribution_path = self.make_package(tmpdirname)
+            output_location = copy_method(source_distribution_path)
+            _LOGGER.info("Training script copied to:\n%s." % output_location)
+            return output_location
+
+    def package_and_copy_to_gcs(
+        self,
+        gcs_staging_dir: str,
+        project: Optional[str] = None,
+        credentials: Optional[auth_credentials.Credentials] = None,
+    ) -> str:
+        """Packages the script into a Python package and copies it to a GCS bucket.
+
+        Args:
+            gcs_staging_dir (str): Required. GCS staging directory.
+            project (str):
+                Optional. Project where the GCS staging bucket is located. The
+                environment default is used if not provided.
+            credentials (auth_credentials.Credentials):
+                Optional credentials used with the GCS client.
+        Returns:
+            GCS location of the Python package.
+        """
+
+        copy_method = functools.partial(
+            utils._timestamped_copy_to_gcs,
+            gcs_dir=gcs_staging_dir,
+            project=project,
+            credentials=credentials,
+        )
+        return self.package_and_copy(copy_method=copy_method)
diff --git a/google/cloud/aiplatform/utils/worker_spec_utils.py b/google/cloud/aiplatform/utils/worker_spec_utils.py
new file mode 100644
index 0000000000..385ac83979
--- /dev/null
+++ b/google/cloud/aiplatform/utils/worker_spec_utils.py
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import NamedTuple, Optional, Dict, Union, List
+
+from google.cloud.aiplatform import utils
+from google.cloud.aiplatform.compat.types import (
+    accelerator_type as gca_accelerator_type_compat,
+)
+
+
+class _MachineSpec(NamedTuple):
+    """Specification container for machine specs used for distributed training.
+
+    Usage:
+
+    spec = _MachineSpec(
+        replica_count=10,
+        machine_type='n1-standard-4',
+        accelerator_count=2,
+        accelerator_type='NVIDIA_TESLA_K80')
+
+    Note that container and Python package specs are not stored with this spec.
+    """
+
+    replica_count: int = 0
+    machine_type: str = "n1-standard-4"
+    accelerator_count: int = 0
+    accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED"
+
+    def _get_accelerator_type(self) -> Optional[str]:
+        """Validates accelerator_type and returns the name of the accelerator.
+
+        Returns:
+            The validated accelerator name, or None if no accelerator is set.
+
+        Raises:
+            ValueError: If the accelerator type is invalid.
+ """ + + # Raises ValueError if invalid accelerator_type + utils.validate_accelerator_type(self.accelerator_type) + + accelerator_enum = getattr( + gca_accelerator_type_compat.AcceleratorType, self.accelerator_type + ) + + if ( + accelerator_enum + != gca_accelerator_type_compat.AcceleratorType.ACCELERATOR_TYPE_UNSPECIFIED + ): + return self.accelerator_type + + @property + def spec_dict(self) -> Dict[str, Union[int, str, Dict[str, Union[int, str]]]]: + """Return specification as a Dict.""" + spec = { + "machine_spec": {"machine_type": self.machine_type}, + "replica_count": self.replica_count, + } + accelerator_type = self._get_accelerator_type() + if accelerator_type and self.accelerator_count: + spec["machine_spec"]["accelerator_type"] = accelerator_type + spec["machine_spec"]["accelerator_count"] = self.accelerator_count + + return spec + + @property + def is_empty(self) -> bool: + """Returns True is replica_count > 0 False otherwise.""" + return self.replica_count <= 0 + + +class _DistributedTrainingSpec(NamedTuple): + """Configuration for distributed training worker pool specs. + + Vertex AI Training expects configuration in this order: + [ + chief spec, # can only have one replica + worker spec, + parameter server spec, + evaluator spec + ] + + Usage: + + dist_training_spec = _DistributedTrainingSpec( + chief_spec = _MachineSpec( + replica_count=1, + machine_type='n1-standard-4', + accelerator_count=2, + accelerator_type='NVIDIA_TESLA_K80' + ), + worker_spec = _MachineSpec( + replica_count=10, + machine_type='n1-standard-4', + accelerator_count=2, + accelerator_type='NVIDIA_TESLA_K80' + ) + ) + """ + + chief_spec: _MachineSpec = _MachineSpec() + worker_spec: _MachineSpec = _MachineSpec() + parameter_server_spec: _MachineSpec = _MachineSpec() + evaluator_spec: _MachineSpec = _MachineSpec() + + @property + def pool_specs( + self, + ) -> List[Dict[str, Union[int, str, Dict[str, Union[int, str]]]]]: + """Return each pools spec in correct order for Vertex AI as a list of + dicts. + + Also removes specs if they are empty but leaves specs in if there unusual + specifications to not break the ordering in Vertex AI Training. + ie. 0 chief replica, 10 worker replica, 3 ps replica + + Returns: + Order list of worker pool specs suitable for Vertex AI Training. + """ + if self.chief_spec.replica_count > 1: + raise ValueError("Chief spec replica count cannot be greater than 1.") + + spec_order = [ + self.chief_spec, + self.worker_spec, + self.parameter_server_spec, + self.evaluator_spec, + ] + specs = [s.spec_dict for s in spec_order] + for i in reversed(range(len(spec_order))): + if spec_order[i].is_empty: + specs.pop() + else: + break + return specs + + @classmethod + def chief_worker_pool( + cls, + replica_count: int = 0, + machine_type: str = "n1-standard-4", + accelerator_count: int = 0, + accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + ) -> "_DistributedTrainingSpec": + """Parameterizes Config to support only chief with worker replicas. + + For replica is assigned to chief and the remainder to workers. All spec have the + same machine type, accelerator count, and accelerator type. + + Args: + replica_count (int): + The number of worker replicas. Assigns 1 chief replica and + replica_count - 1 worker replicas. + machine_type (str): + The type of machine to use for training. + accelerator_type (str): + Hardware accelerator type. 
One of ACCELERATOR_TYPE_UNSPECIFIED, + NVIDIA_TESLA_K80, NVIDIA_TESLA_P100, NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, + NVIDIA_TESLA_T4 + accelerator_count (int): + The number of accelerators to attach to a worker replica. + + Returns: + _DistributedTrainingSpec representing one chief and n workers all of same + type. If replica_count <= 0 then an empty spec is returned. + """ + if replica_count <= 0: + return cls() + + chief_spec = _MachineSpec( + replica_count=1, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ) + + worker_spec = _MachineSpec( + replica_count=replica_count - 1, + machine_type=machine_type, + accelerator_count=accelerator_count, + accelerator_type=accelerator_type, + ) + + return cls(chief_spec=chief_spec, worker_spec=worker_spec) diff --git a/setup.py b/setup.py index a40d87c1da..6b11cfe2e3 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools # type: ignore name = "google-cloud-aiplatform" -version = "0.9.0" +version = "1.0.0" description = "Cloud AI Platform API client library" package_root = os.path.abspath(os.path.dirname(__file__)) @@ -73,7 +73,7 @@ python_requires=">=3.6", scripts=[], classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Operating System :: OS Independent", "Programming Language :: Python :: 3.6", diff --git a/tests/unit/aiplatform/test_custom_job.py b/tests/unit/aiplatform/test_custom_job.py new file mode 100644 index 0000000000..7797e0edef --- /dev/null +++ b/tests/unit/aiplatform/test_custom_job.py @@ -0,0 +1,392 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest + +import copy +from importlib import reload +from unittest import mock +from unittest.mock import patch + +from google.protobuf import duration_pb2 # type: ignore +from google.rpc import status_pb2 + +import test_training_jobs +from test_training_jobs import mock_python_package_to_gcs # noqa: F401 + +from google.cloud import aiplatform +from google.cloud.aiplatform.compat.types import custom_job as gca_custom_job_compat +from google.cloud.aiplatform.compat.types import ( + custom_job_v1beta1 as gca_custom_job_v1beta1, +) +from google.cloud.aiplatform.compat.types import io as gca_io_compat +from google.cloud.aiplatform.compat.types import job_state as gca_job_state_compat +from google.cloud.aiplatform.compat.types import ( + encryption_spec as gca_encryption_spec_compat, +) +from google.cloud.aiplatform_v1.services.job_service import client as job_service_client +from google.cloud.aiplatform_v1beta1.services.job_service import ( + client as job_service_client_v1beta1, +) + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_ID = "1028944691210842416" +_TEST_DISPLAY_NAME = "my_job_1234" + +_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}" + +_TEST_CUSTOM_JOB_NAME = f"{_TEST_PARENT}/customJobs/{_TEST_ID}" +_TEST_TENSORBOARD_NAME = f"{_TEST_PARENT}/tensorboards/{_TEST_ID}" + +_TEST_TRAINING_CONTAINER_IMAGE = "gcr.io/test-training/container:image" + +_TEST_WORKER_POOL_SPEC = [ + { + "machine_spec": { + "machine_type": "n1-standard-4", + "accelerator_type": "NVIDIA_TESLA_K80", + "accelerator_count": 1, + }, + "replica_count": 1, + "container_spec": { + "image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "command": [], + "args": [], + }, + } +] + +_TEST_STAGING_BUCKET = "gs://test-staging-bucket" + +# CMEK encryption +_TEST_DEFAULT_ENCRYPTION_KEY_NAME = "key_default" +_TEST_DEFAULT_ENCRYPTION_SPEC = gca_encryption_spec_compat.EncryptionSpec( + kms_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME +) + +_TEST_SERVICE_ACCOUNT = "vinnys@my-project.iam.gserviceaccount.com" + + +_TEST_NETWORK = f"projects/{_TEST_PROJECT}/global/networks/{_TEST_ID}" + +_TEST_TIMEOUT = 8000 +_TEST_RESTART_JOB_ON_WORKER_RESTART = True + +_TEST_BASE_CUSTOM_JOB_PROTO = gca_custom_job_compat.CustomJob( + display_name=_TEST_DISPLAY_NAME, + job_spec=gca_custom_job_compat.CustomJobSpec( + worker_pool_specs=_TEST_WORKER_POOL_SPEC, + base_output_directory=gca_io_compat.GcsDestination( + output_uri_prefix=_TEST_STAGING_BUCKET + ), + scheduling=gca_custom_job_compat.Scheduling( + timeout=duration_pb2.Duration(seconds=_TEST_TIMEOUT), + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + ), + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + ), + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, +) + + +def _get_custom_job_proto(state=None, name=None, error=None, version="v1"): + custom_job_proto = copy.deepcopy(_TEST_BASE_CUSTOM_JOB_PROTO) + custom_job_proto.name = name + custom_job_proto.state = state + custom_job_proto.error = error + + if version == "v1beta1": + v1beta1_custom_job_proto = gca_custom_job_v1beta1.CustomJob() + v1beta1_custom_job_proto._pb.MergeFromString( + custom_job_proto._pb.SerializeToString() + ) + custom_job_proto = v1beta1_custom_job_proto + custom_job_proto.job_spec.tensorboard = _TEST_TENSORBOARD_NAME + + return custom_job_proto + + +@pytest.fixture +def get_custom_job_mock(): + with patch.object( + job_service_client.JobServiceClient, "get_custom_job" + ) as get_custom_job_mock: + get_custom_job_mock.side_effect = [ + 
_get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED, + ), + ] + yield get_custom_job_mock + + +@pytest.fixture +def get_custom_job_mock_with_fail(): + with patch.object( + job_service_client.JobServiceClient, "get_custom_job" + ) as get_custom_job_mock: + get_custom_job_mock.side_effect = [ + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_FAILED, + error=status_pb2.Status(message="Test Error"), + ), + ] + yield get_custom_job_mock + + +@pytest.fixture +def create_custom_job_mock(): + with mock.patch.object( + job_service_client.JobServiceClient, "create_custom_job" + ) as create_custom_job_mock: + create_custom_job_mock.return_value = _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ) + yield create_custom_job_mock + + +@pytest.fixture +def create_custom_job_v1beta1_mock(): + with mock.patch.object( + job_service_client_v1beta1.JobServiceClient, "create_custom_job" + ) as create_custom_job_mock: + create_custom_job_mock.return_value = _get_custom_job_proto( + name=_TEST_CUSTOM_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + version="v1beta1", + ) + yield create_custom_job_mock + + +class TestCustomJob: + def setup_method(self): + reload(aiplatform.initializer) + reload(aiplatform) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_custom_job(self, create_custom_job_mock, get_custom_job_mock, sync): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_custom_job = _get_custom_job_proto() + + create_custom_job_mock.assert_called_once_with( + parent=_TEST_PARENT, custom_job=expected_custom_job + ) + + assert job.job_spec == expected_custom_job.job_spec + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_run_custom_job_with_fail_raises( + self, create_custom_job_mock, get_custom_job_mock_with_fail, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + with pytest.raises(RuntimeError): + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + 
restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_custom_job = _get_custom_job_proto() + + create_custom_job_mock.assert_called_once_with( + parent=_TEST_PARENT, custom_job=expected_custom_job + ) + + assert job.job_spec == expected_custom_job.job_spec + assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED + + def test_custom_job_get_state_raises_without_run(self): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + with pytest.raises(RuntimeError): + print(job.state) + + def test_no_staging_bucket_raises(self): + + aiplatform.init(project=_TEST_PROJECT, location=_TEST_LOCATION) + + with pytest.raises(RuntimeError): + job = aiplatform.CustomJob( # noqa: F841 + display_name=_TEST_DISPLAY_NAME, + worker_pool_specs=_TEST_WORKER_POOL_SPEC, + ) + + def test_get_custom_job(self, get_custom_job_mock): + + job = aiplatform.CustomJob.get(_TEST_CUSTOM_JOB_NAME) + + get_custom_job_mock.assert_called_once_with(name=_TEST_CUSTOM_JOB_NAME) + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_PENDING + ) + assert job.job_spec == _TEST_BASE_CUSTOM_JOB_PROTO.job_spec + + @pytest.mark.usefixtures("mock_python_package_to_gcs") + @pytest.mark.parametrize("sync", [True, False]) + def test_create_from_local_script( + self, get_custom_job_mock, create_custom_job_mock, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + # configuration on this is tested in test_training_jobs.py + job = aiplatform.CustomJob.from_local_script( + display_name=_TEST_DISPLAY_NAME, + script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, + container_uri=_TEST_TRAINING_CONTAINER_IMAGE, + ) + + job.run(sync=sync) + + job.wait() + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.usefixtures("mock_python_package_to_gcs") + @pytest.mark.parametrize("sync", [True, False]) + def test_create_from_local_script_raises_with_no_staging_bucket( + self, get_custom_job_mock, create_custom_job_mock, sync + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + with pytest.raises(RuntimeError): + + # configuration on this is tested in test_training_jobs.py + job = aiplatform.CustomJob.from_local_script( # noqa: F841 + display_name=_TEST_DISPLAY_NAME, + script_path=test_training_jobs._TEST_LOCAL_SCRIPT_FILE_NAME, + container_uri=_TEST_TRAINING_CONTAINER_IMAGE, + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_custom_job_with_tensorboard( + self, create_custom_job_v1beta1_mock, get_custom_job_mock, sync + ): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + job = aiplatform.CustomJob( + display_name=_TEST_DISPLAY_NAME, worker_pool_specs=_TEST_WORKER_POOL_SPEC + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + tensorboard=_TEST_TENSORBOARD_NAME, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + 
restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_custom_job = _get_custom_job_proto(version="v1beta1") + + create_custom_job_v1beta1_mock.assert_called_once_with( + parent=_TEST_PARENT, custom_job=expected_custom_job + ) + + expected_custom_job = _get_custom_job_proto() + + assert job.job_spec == expected_custom_job.job_spec + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) diff --git a/tests/unit/aiplatform/test_end_to_end.py b/tests/unit/aiplatform/test_end_to_end.py index 69c5517a69..4aede65f08 100644 --- a/tests/unit/aiplatform/test_end_to_end.py +++ b/tests/unit/aiplatform/test_end_to_end.py @@ -19,11 +19,11 @@ from importlib import reload +from google.cloud.aiplatform.utils import source_utils from google.cloud import aiplatform from google.cloud.aiplatform import initializer from google.cloud.aiplatform import models from google.cloud.aiplatform import schema -from google.cloud.aiplatform import training_jobs from google.cloud.aiplatform_v1.types import ( dataset as gca_dataset, @@ -204,16 +204,16 @@ def test_dataset_create_to_model_predict( true_args = test_training_jobs._TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": test_training_jobs._TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": test_training_jobs._TEST_MACHINE_TYPE, - "acceleratorType": test_training_jobs._TEST_ACCELERATOR_TYPE, - "acceleratorCount": test_training_jobs._TEST_ACCELERATOR_COUNT, + "replica_count": test_training_jobs._TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": test_training_jobs._TEST_MACHINE_TYPE, + "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, + "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -248,8 +248,8 @@ def test_dataset_create_to_model_predict( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": { + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { "output_uri_prefix": test_training_jobs._TEST_BASE_OUTPUT_DIR }, }, @@ -385,16 +385,16 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( true_args = test_training_jobs._TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": test_training_jobs._TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": test_training_jobs._TEST_MACHINE_TYPE, - "acceleratorType": test_training_jobs._TEST_ACCELERATOR_TYPE, - "acceleratorCount": test_training_jobs._TEST_ACCELERATOR_COUNT, + "replica_count": test_training_jobs._TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": test_training_jobs._TEST_MACHINE_TYPE, + "accelerator_type": test_training_jobs._TEST_ACCELERATOR_TYPE, + "accelerator_count": test_training_jobs._TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, 
- "packageUris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": test_training_jobs._TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [test_training_jobs._TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -430,8 +430,8 @@ def test_dataset_create_to_model_predict_with_pipeline_fail( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": { + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { "output_uri_prefix": test_training_jobs._TEST_BASE_OUTPUT_DIR }, }, diff --git a/tests/unit/aiplatform/test_hyperparameter_tuning_job.py b/tests/unit/aiplatform/test_hyperparameter_tuning_job.py new file mode 100644 index 0000000000..f4102fc3bb --- /dev/null +++ b/tests/unit/aiplatform/test_hyperparameter_tuning_job.py @@ -0,0 +1,467 @@ +# -*- coding: utf-8 -*- +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pytest + +import copy +from importlib import reload +from unittest import mock +from unittest.mock import patch + +from google.rpc import status_pb2 + +from google.cloud import aiplatform +from google.cloud.aiplatform import hyperparameter_tuning as hpt +from google.cloud.aiplatform.compat.types import job_state as gca_job_state_compat +from google.cloud.aiplatform.compat.types import ( + encryption_spec as gca_encryption_spec_compat, +) +from google.cloud.aiplatform.compat.types import ( + hyperparameter_tuning_job as gca_hyperparameter_tuning_job_compat, + hyperparameter_tuning_job_v1beta1 as gca_hyperparameter_tuning_job_v1beta1, +) +from google.cloud.aiplatform.compat.types import study as gca_study_compat +from google.cloud.aiplatform_v1.services.job_service import client as job_service_client +from google.cloud.aiplatform_v1beta1.services.job_service import ( + client as job_service_client_v1beta1, +) + +import test_custom_job + +_TEST_PROJECT = "test-project" +_TEST_LOCATION = "us-central1" +_TEST_ID = "1028944691210842416" +_TEST_DISPLAY_NAME = "my_hp_job_1234" + +_TEST_PARENT = f"projects/{_TEST_PROJECT}/locations/{_TEST_LOCATION}" + +_TEST_STAGING_BUCKET = test_custom_job._TEST_STAGING_BUCKET + +_TEST_HYPERPARAMETERTUNING_JOB_NAME = ( + f"{_TEST_PARENT}/hyperparameterTuningJobs/{_TEST_ID}" +) + +# CMEK encryption +_TEST_DEFAULT_ENCRYPTION_KEY_NAME = "key_default" +_TEST_DEFAULT_ENCRYPTION_SPEC = gca_encryption_spec_compat.EncryptionSpec( + kms_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME +) + +_TEST_SERVICE_ACCOUNT = "vinnys@my-project.iam.gserviceaccount.com" + + +_TEST_NETWORK = f"projects/{_TEST_PROJECT}/global/networks/{_TEST_ID}" + +_TEST_TIMEOUT = 8000 +_TEST_RESTART_JOB_ON_WORKER_RESTART = True + +_TEST_METRIC_SPEC_KEY = "test-metric" +_TEST_METRIC_SPEC_VALUE = "maximize" + +_TEST_PARALLEL_TRIAL_COUNT = 8 +_TEST_MAX_TRIAL_COUNT = 64 
+_TEST_MAX_FAILED_TRIAL_COUNT = 4 +_TEST_SEARCH_ALGORITHM = "random" +_TEST_MEASUREMENT_SELECTION = "best" + + +_TEST_BASE_HYPERPARAMETER_TUNING_JOB_PROTO = gca_hyperparameter_tuning_job_compat.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + study_spec=gca_study_compat.StudySpec( + metrics=[ + gca_study_compat.StudySpec.MetricSpec( + metric_id=_TEST_METRIC_SPEC_KEY, goal=_TEST_METRIC_SPEC_VALUE.upper() + ) + ], + parameters=[ + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="lr", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LOG_SCALE, + double_value_spec=gca_study_compat.StudySpec.ParameterSpec.DoubleValueSpec( + min_value=0.001, max_value=0.1 + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="units", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, + integer_value_spec=gca_study_compat.StudySpec.ParameterSpec.IntegerValueSpec( + min_value=4, max_value=1028 + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="activation", + categorical_value_spec=gca_study_compat.StudySpec.ParameterSpec.CategoricalValueSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + ), + gca_study_compat.StudySpec.ParameterSpec( + parameter_id="batch_size", + scale_type=gca_study_compat.StudySpec.ParameterSpec.ScaleType.UNIT_LINEAR_SCALE, + discrete_value_spec=gca_study_compat.StudySpec.ParameterSpec.DiscreteValueSpec( + values=[16, 32] + ), + ), + ], + algorithm=gca_study_compat.StudySpec.Algorithm.RANDOM_SEARCH, + measurement_selection_type=gca_study_compat.StudySpec.MeasurementSelectionType.BEST_MEASUREMENT, + ), + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + trial_job_spec=test_custom_job._TEST_BASE_CUSTOM_JOB_PROTO.job_spec, + encryption_spec=_TEST_DEFAULT_ENCRYPTION_SPEC, +) + + +def _get_hyperparameter_tuning_job_proto( + state=None, name=None, error=None, version="v1" +): + hyperparameter_tuning_job_proto = copy.deepcopy( + _TEST_BASE_HYPERPARAMETER_TUNING_JOB_PROTO + ) + hyperparameter_tuning_job_proto.name = name + hyperparameter_tuning_job_proto.state = state + hyperparameter_tuning_job_proto.error = error + + if version == "v1beta1": + v1beta1_hyperparameter_tuning_job_proto = ( + gca_hyperparameter_tuning_job_v1beta1.HyperparameterTuningJob() + ) + v1beta1_hyperparameter_tuning_job_proto._pb.MergeFromString( + hyperparameter_tuning_job_proto._pb.SerializeToString() + ) + hyperparameter_tuning_job_proto = v1beta1_hyperparameter_tuning_job_proto + hyperparameter_tuning_job_proto.trial_job_spec.tensorboard = ( + test_custom_job._TEST_TENSORBOARD_NAME + ) + + return hyperparameter_tuning_job_proto + + +@pytest.fixture +def get_hyperparameter_tuning_job_mock(): + with patch.object( + job_service_client.JobServiceClient, "get_hyperparameter_tuning_job" + ) as get_hyperparameter_tuning_job_mock: + get_hyperparameter_tuning_job_mock.side_effect = [ + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED, + ), + ] + yield get_hyperparameter_tuning_job_mock + + +@pytest.fixture +def 
get_hyperparameter_tuning_job_mock_with_fail(): + with patch.object( + job_service_client.JobServiceClient, "get_hyperparameter_tuning_job" + ) as get_hyperparameter_tuning_job_mock: + get_hyperparameter_tuning_job_mock.side_effect = [ + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_RUNNING, + ), + _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_FAILED, + error=status_pb2.Status(message="Test Error"), + ), + ] + yield get_hyperparameter_tuning_job_mock + + +@pytest.fixture +def create_hyperparameter_tuning_job_mock(): + with mock.patch.object( + job_service_client.JobServiceClient, "create_hyperparameter_tuning_job" + ) as create_hyperparameter_tuning_job_mock: + create_hyperparameter_tuning_job_mock.return_value = _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + ) + yield create_hyperparameter_tuning_job_mock + + +@pytest.fixture +def create_hyperparameter_tuning_job_v1beta1_mock(): + with mock.patch.object( + job_service_client_v1beta1.JobServiceClient, "create_hyperparameter_tuning_job" + ) as create_hyperparameter_tuning_job_mock: + create_hyperparameter_tuning_job_mock.return_value = _get_hyperparameter_tuning_job_proto( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME, + state=gca_job_state_compat.JobState.JOB_STATE_PENDING, + version="v1beta1", + ) + yield create_hyperparameter_tuning_job_mock + + +class TestHyperparameterTuningJob: + def setup_method(self): + reload(aiplatform.initializer) + reload(aiplatform) + + def teardown_method(self): + aiplatform.initializer.global_pool.shutdown(wait=True) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_hyperparameter_tuning_job( + self, + create_hyperparameter_tuning_job_mock, + get_hyperparameter_tuning_job_mock, + sync, + ): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto() + + 
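+        # The create RPC should be issued exactly once, carrying the proto built above.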
create_hyperparameter_tuning_job_mock.assert_called_once_with( + parent=_TEST_PARENT, + hyperparameter_tuning_job=expected_hyperparameter_tuning_job, + ) + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_run_hyperparameter_tuning_job_with_fail_raises( + self, + create_hyperparameter_tuning_job_mock, + get_hyperparameter_tuning_job_mock_with_fail, + sync, + ): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + with pytest.raises(RuntimeError): + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + sync=sync, + ) + + job.wait() + + expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto() + + create_hyperparameter_tuning_job_mock.assert_called_once_with( + parent=_TEST_PARENT, + hyperparameter_tuning_job=expected_hyperparameter_tuning_job, + ) + + assert job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_FAILED + + def test_hyperparameter_tuning_job_get_state_raises_without_run(self): + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32, 64], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + with pytest.raises(RuntimeError): + print(job.state) + + def test_get_hyperparameter_tuning_job(self, get_hyperparameter_tuning_job_mock): + + job = aiplatform.HyperparameterTuningJob.get( + _TEST_HYPERPARAMETERTUNING_JOB_NAME + ) + + 
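+        # Retrieval by resource name should result in a single get call for that name.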
get_hyperparameter_tuning_job_mock.assert_called_once_with( + name=_TEST_HYPERPARAMETERTUNING_JOB_NAME + ) + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_PENDING + ) + + @pytest.mark.parametrize("sync", [True, False]) + def test_create_hyperparameter_tuning_job_with_tensorboard( + self, + create_hyperparameter_tuning_job_v1beta1_mock, + get_hyperparameter_tuning_job_mock, + sync, + ): + + aiplatform.init( + project=_TEST_PROJECT, + location=_TEST_LOCATION, + staging_bucket=_TEST_STAGING_BUCKET, + encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME, + ) + + custom_job = aiplatform.CustomJob( + display_name=test_custom_job._TEST_DISPLAY_NAME, + worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC, + ) + + job = aiplatform.HyperparameterTuningJob( + display_name=_TEST_DISPLAY_NAME, + custom_job=custom_job, + metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE}, + parameter_spec={ + "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"), + "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"), + "activation": hpt.CategoricalParameterSpec( + values=["relu", "sigmoid", "elu", "selu", "tanh"] + ), + "batch_size": hpt.DiscreteParameterSpec( + values=[16, 32], scale="linear" + ), + }, + parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT, + max_trial_count=_TEST_MAX_TRIAL_COUNT, + max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT, + search_algorithm=_TEST_SEARCH_ALGORITHM, + measurement_selection=_TEST_MEASUREMENT_SELECTION, + ) + + job.run( + service_account=_TEST_SERVICE_ACCOUNT, + network=_TEST_NETWORK, + timeout=_TEST_TIMEOUT, + restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART, + tensorboard=test_custom_job._TEST_TENSORBOARD_NAME, + sync=sync, + ) + + job.wait() + + expected_hyperparameter_tuning_job = _get_hyperparameter_tuning_job_proto( + version="v1beta1" + ) + + create_hyperparameter_tuning_job_v1beta1_mock.assert_called_once_with( + parent=_TEST_PARENT, + hyperparameter_tuning_job=expected_hyperparameter_tuning_job, + ) + + assert ( + job._gca_resource.state == gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED + ) diff --git a/tests/unit/aiplatform/test_training_jobs.py b/tests/unit/aiplatform/test_training_jobs.py index 8fd82c7727..75478263e8 100644 --- a/tests/unit/aiplatform/test_training_jobs.py +++ b/tests/unit/aiplatform/test_training_jobs.py @@ -30,6 +30,9 @@ from google.auth import credentials as auth_credentials +from google.cloud.aiplatform import utils +from google.cloud.aiplatform.utils import source_utils +from google.cloud.aiplatform.utils import worker_spec_utils from google.cloud import aiplatform from google.cloud.aiplatform import datasets @@ -234,7 +237,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_bucket( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT, @@ -261,7 +264,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_gcs_path( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_GCS_PATH_WITH_TRAILING_SLASH, project=_TEST_PROJECT, @@ -289,7 +292,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client_with_trailing_slash( mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = 
training_jobs._timestamped_copy_to_gcs( + gcs_path = utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_GCS_PATH, project=_TEST_PROJECT, @@ -315,7 +318,7 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket): mock_client_bucket, mock_blob = mock_client_bucket - gcs_path = training_jobs._timestamped_copy_to_gcs( + gcs_path = utils._timestamped_copy_to_gcs( local_file_path=_TEST_LOCAL_SCRIPT_FILE_PATH, gcs_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT, @@ -332,10 +335,10 @@ def test_timestamp_copy_to_gcs_calls_gcs_client(self, mock_client_bucket): def test_get_python_executable_raises_if_None(self): with patch.object(sys, "executable", new=None): with pytest.raises(EnvironmentError): - training_jobs._get_python_executable() + source_utils._get_python_executable() def test_get_python_executable_returns_python_executable(self): - assert "python" in training_jobs._get_python_executable().lower() + assert "python" in source_utils._get_python_executable().lower() class TestTrainingScriptPythonPackager: @@ -347,7 +350,7 @@ def setup_method(self): def teardown_method(self): pathlib.Path(_TEST_LOCAL_SCRIPT_FILE_NAME).unlink() - python_package_file = f"{training_jobs._TrainingScriptPythonPackager._ROOT_MODULE}-{training_jobs._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" + python_package_file = f"{source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}.tar.gz" if pathlib.Path(python_package_file).is_file(): pathlib.Path(python_package_file).unlink() subprocess.check_output( @@ -355,34 +358,34 @@ def teardown_method(self): "pip3", "uninstall", "-y", - training_jobs._TrainingScriptPythonPackager._ROOT_MODULE, + source_utils._TrainingScriptPythonPackager._ROOT_MODULE, ] ) def test_packager_creates_and_copies_python_package(self): - tsp = training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) tsp.package_and_copy(copy_method=local_copy_method) assert pathlib.Path( f"{tsp._ROOT_MODULE}-{tsp._SETUP_PY_VERSION}.tar.gz" ).is_file() def test_created_package_module_is_installable_and_can_be_run(self): - tsp = training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) subprocess.check_output(["pip3", "install", source_dist_path]) module_output = subprocess.check_output( - [training_jobs._get_python_executable(), "-m", tsp.module_name] + [source_utils._get_python_executable(), "-m", tsp.module_name] ) assert "hello world" in module_output.decode() def test_requirements_are_in_package(self): - tsp = training_jobs._TrainingScriptPythonPackager( + tsp = source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME, requirements=_TEST_REQUIREMENTS ) source_dist_path = tsp.package_and_copy(copy_method=local_copy_method) with tarfile.open(source_dist_path) as tf: with tempfile.TemporaryDirectory() as tmpdirname: - setup_py_path = f"{training_jobs._TrainingScriptPythonPackager._ROOT_MODULE}-{training_jobs._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" + setup_py_path = f"{source_utils._TrainingScriptPythonPackager._ROOT_MODULE}-{source_utils._TrainingScriptPythonPackager._SETUP_PY_VERSION}/setup.py" tf.extract(setup_py_path, path=tmpdirname) setup_py = core.run_setup( pathlib.Path(tmpdirname, 
setup_py_path), stop_after="init" @@ -395,7 +398,7 @@ def test_packaging_fails_whith_RuntimeError(self): mock_subprocess.communicate.return_value = (b"", b"") mock_subprocess.returncode = 1 mock_popen.return_value = mock_subprocess - tsp = training_jobs._TrainingScriptPythonPackager( + tsp = source_utils._TrainingScriptPythonPackager( _TEST_LOCAL_SCRIPT_FILE_NAME ) with pytest.raises(RuntimeError): @@ -404,7 +407,7 @@ def test_packaging_fails_whith_RuntimeError(self): def test_package_and_copy_to_gcs_copies_to_gcs(self, mock_client_bucket): mock_client_bucket, mock_blob = mock_client_bucket - tsp = training_jobs._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) + tsp = source_utils._TrainingScriptPythonPackager(_TEST_LOCAL_SCRIPT_FILE_NAME) gcs_path = tsp.package_and_copy_to_gcs( gcs_staging_dir=_TEST_BUCKET_NAME, project=_TEST_PROJECT @@ -512,7 +515,7 @@ def mock_model_service_get(): @pytest.fixture def mock_python_package_to_gcs(): with mock.patch.object( - training_jobs._TrainingScriptPythonPackager, "package_and_copy_to_gcs" + source_utils._TrainingScriptPythonPackager, "package_and_copy_to_gcs" ) as mock_package_to_copy_gcs: mock_package_to_copy_gcs.return_value = _TEST_OUTPUT_PYTHON_PACKAGE_PATH yield mock_package_to_copy_gcs @@ -630,16 +633,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -699,9 +702,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -789,16 +794,16 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": 
_TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -858,8 +863,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1064,16 +1071,16 @@ def test_run_call_pipeline_service_create_with_no_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -1094,8 +1101,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1316,31 +1325,31 @@ def test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": 
[_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -1382,8 +1391,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1544,16 +1555,16 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": training_jobs._TrainingScriptPythonPackager.module_name, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": source_utils._TrainingScriptPythonPackager.module_name, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -1609,8 +1620,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1784,11 +1797,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -1852,8 +1865,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -1932,11 +1947,11 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": 
_TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -1999,8 +2014,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2189,11 +2206,11 @@ def test_run_call_pipeline_service_create_with_no_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2217,8 +2234,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2418,11 +2437,11 @@ def test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2431,11 +2450,11 @@ def test_run_call_pipeline_service_create_distributed_training( }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2480,8 +2499,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -2558,11 +2579,11 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - 
"acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, "containerSpec": { "imageUri": _TEST_TRAINING_CONTAINER_IMAGE, @@ -2622,9 +2643,11 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -2689,7 +2712,7 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset_raises_if_anno class Test_MachineSpec: def test_machine_spec_return_spec_dict(self): - test_spec = training_jobs._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2697,18 +2720,18 @@ def test_machine_spec_return_spec_dict(self): ) true_spec_dict = { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": _TEST_REPLICA_COUNT, + "replica_count": _TEST_REPLICA_COUNT, } assert test_spec.spec_dict == true_spec_dict def test_machine_spec_return_spec_dict_with_no_accelerator(self): - test_spec = training_jobs._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=0, @@ -2716,14 +2739,14 @@ def test_machine_spec_return_spec_dict_with_no_accelerator(self): ) true_spec_dict = { - "machineSpec": {"machineType": _TEST_MACHINE_TYPE}, - "replicaCount": _TEST_REPLICA_COUNT, + "machine_spec": {"machine_type": _TEST_MACHINE_TYPE}, + "replica_count": _TEST_REPLICA_COUNT, } assert test_spec.spec_dict == true_spec_dict def test_machine_spec_spec_dict_raises_invalid_accelerator(self): - test_spec = training_jobs._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2734,7 +2757,7 @@ def test_machine_spec_spec_dict_raises_invalid_accelerator(self): test_spec.spec_dict def test_machine_spec_spec_dict_is_empty(self): - test_spec = training_jobs._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=0, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2744,7 +2767,7 @@ def test_machine_spec_spec_dict_is_empty(self): assert test_spec.is_empty def test_machine_spec_spec_dict_is_not_empty(self): - test_spec = training_jobs._MachineSpec( + test_spec = worker_spec_utils._MachineSpec( replica_count=_TEST_REPLICA_COUNT, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2757,26 +2780,26 @@ def test_machine_spec_spec_dict_is_not_empty(self): class Test_DistributedTrainingSpec: def test_machine_spec_returns_pool_spec(self): - spec = 
training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=training_jobs._MachineSpec( + worker_spec=worker_spec_utils._MachineSpec( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - parameter_server_spec=training_jobs._MachineSpec( + parameter_server_spec=worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=training_jobs._MachineSpec( + evaluator_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2786,36 +2809,36 @@ def test_machine_spec_returns_pool_spec(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 10, + "replica_count": 10, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 3, + "replica_count": 3, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, ] @@ -2823,7 +2846,7 @@ def test_machine_spec_returns_pool_spec(self): def test_chief_worker_pool_returns_spec(self): - chief_worker_spec = training_jobs._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=10, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2832,20 +2855,20 @@ def test_chief_worker_pool_returns_spec(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + 
"accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 9, + "replica_count": 9, }, ] @@ -2853,7 +2876,7 @@ def test_chief_worker_pool_returns_spec(self): def test_chief_worker_pool_returns_just_chief(self): - chief_worker_spec = training_jobs._DistributedTrainingSpec.chief_worker_pool( + chief_worker_spec = worker_spec_utils._DistributedTrainingSpec.chief_worker_pool( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2862,12 +2885,12 @@ def test_chief_worker_pool_returns_just_chief(self): true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, } ] @@ -2875,8 +2898,8 @@ def test_chief_worker_pool_returns_just_chief(self): def test_machine_spec_raise_with_more_than_one_chief_replica(self): - spec = training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=2, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, @@ -2889,40 +2912,40 @@ def test_machine_spec_raise_with_more_than_one_chief_replica(self): def test_machine_spec_handles_missing_pools(self): - spec = training_jobs._DistributedTrainingSpec( - chief_spec=training_jobs._MachineSpec( + spec = worker_spec_utils._DistributedTrainingSpec( + chief_spec=worker_spec_utils._MachineSpec( replica_count=1, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - worker_spec=training_jobs._MachineSpec(replica_count=0), - parameter_server_spec=training_jobs._MachineSpec( + worker_spec=worker_spec_utils._MachineSpec(replica_count=0), + parameter_server_spec=worker_spec_utils._MachineSpec( replica_count=3, machine_type=_TEST_MACHINE_TYPE, accelerator_count=_TEST_ACCELERATOR_COUNT, accelerator_type=_TEST_ACCELERATOR_TYPE, ), - evaluator_spec=training_jobs._MachineSpec(replica_count=0), + evaluator_spec=worker_spec_utils._MachineSpec(replica_count=0), ) true_pool_spec = [ { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 1, + "replica_count": 1, }, - {"machineSpec": {"machineType": "n1-standard-4"}, "replicaCount": 0}, + {"machine_spec": {"machine_type": "n1-standard-4"}, "replica_count": 0}, { - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "replicaCount": 3, + "replica_count": 3, }, ] @@ -2999,16 +3022,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( ] true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + 
"machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, "env": true_env, }, @@ -3068,9 +3091,11 @@ def test_run_call_pipeline_service_create_with_tabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, - "serviceAccount": _TEST_SERVICE_ACCOUNT, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, + "service_account": _TEST_SERVICE_ACCOUNT, "network": _TEST_NETWORK, }, struct_pb2.Value(), @@ -3152,16 +3177,16 @@ def test_run_call_pipeline_service_create_with_tabular_dataset_without_model_dis true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3220,8 +3245,10 @@ def test_run_call_pipeline_service_create_with_tabular_dataset_without_model_dis training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3301,16 +3328,16 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3369,8 +3396,10 @@ def test_run_call_pipeline_service_create_with_bigquery_destination( 
training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3563,16 +3592,16 @@ def test_run_call_pipeline_service_create_with_no_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -3592,8 +3621,10 @@ def test_run_call_pipeline_service_create_with_no_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3799,30 +3830,30 @@ def test_run_call_pipeline_service_create_distributed_training( true_worker_pool_spec = [ { - "replicaCount": 1, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 1, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, }, { - "replicaCount": 9, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": 9, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, }, @@ -3863,8 +3894,10 @@ def test_run_call_pipeline_service_create_distributed_training( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": 
true_worker_pool_spec, - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": true_worker_pool_spec, + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), @@ -3940,16 +3973,16 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( true_args = _TEST_RUN_ARGS true_worker_pool_spec = { - "replicaCount": _TEST_REPLICA_COUNT, - "machineSpec": { - "machineType": _TEST_MACHINE_TYPE, - "acceleratorType": _TEST_ACCELERATOR_TYPE, - "acceleratorCount": _TEST_ACCELERATOR_COUNT, + "replica_count": _TEST_REPLICA_COUNT, + "machine_spec": { + "machine_type": _TEST_MACHINE_TYPE, + "accelerator_type": _TEST_ACCELERATOR_TYPE, + "accelerator_count": _TEST_ACCELERATOR_COUNT, }, - "pythonPackageSpec": { - "executorImageUri": _TEST_TRAINING_CONTAINER_IMAGE, - "pythonModule": _TEST_PYTHON_MODULE_NAME, - "packageUris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], + "python_package_spec": { + "executor_image_uri": _TEST_TRAINING_CONTAINER_IMAGE, + "python_module": _TEST_PYTHON_MODULE_NAME, + "package_uris": [_TEST_OUTPUT_PYTHON_PACKAGE_PATH], "args": true_args, }, } @@ -4005,8 +4038,10 @@ def test_run_call_pipeline_service_create_with_nontabular_dataset( training_task_definition=schema.training_job.definition.custom_task, training_task_inputs=json_format.ParseDict( { - "workerPoolSpecs": [true_worker_pool_spec], - "baseOutputDirectory": {"output_uri_prefix": _TEST_BASE_OUTPUT_DIR}, + "worker_pool_specs": [true_worker_pool_spec], + "base_output_directory": { + "output_uri_prefix": _TEST_BASE_OUTPUT_DIR + }, }, struct_pb2.Value(), ), diff --git a/tests/unit/aiplatform/test_training_utils.py b/tests/unit/aiplatform/test_training_utils.py deleted file mode 100644 index 1d4b839151..0000000000 --- a/tests/unit/aiplatform/test_training_utils.py +++ /dev/null @@ -1,144 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import json -import os -import pytest - -from google.cloud.aiplatform import training_utils -from unittest import mock - -_TEST_TRAINING_DATA_URI = "gs://training-data-uri" -_TEST_VALIDATION_DATA_URI = "gs://test-validation-data-uri" -_TEST_TEST_DATA_URI = "gs://test-data-uri" -_TEST_MODEL_DIR = "gs://test-model-dir" -_TEST_CHECKPOINT_DIR = "gs://test-checkpoint-dir" -_TEST_TENSORBOARD_LOG_DIR = "gs://test-tensorboard-log-dir" -_TEST_CLUSTER_SPEC = """{ - "cluster": { - "worker_pools":[ - { - "index":0, - "replicas":[ - "training-workerpool0-ab-0:2222" - ] - }, - { - "index":1, - "replicas":[ - "training-workerpool1-ab-0:2222", - "training-workerpool1-ab-1:2222" - ] - } - ] - }, - "environment": "cloud", - "task": { - "worker_pool_index":0, - "replica_index":0, - "trial":"TRIAL_ID" - } -}""" - - -class TestTrainingUtils: - @pytest.fixture - def mock_environment(self): - env_vars = { - "AIP_TRAINING_DATA_URI": _TEST_TRAINING_DATA_URI, - "AIP_VALIDATION_DATA_URI": _TEST_VALIDATION_DATA_URI, - "AIP_TEST_DATA_URI": _TEST_TEST_DATA_URI, - "AIP_MODEL_DIR": _TEST_MODEL_DIR, - "AIP_CHECKPOINT_DIR": _TEST_CHECKPOINT_DIR, - "AIP_TENSORBOARD_LOG_DIR": _TEST_TENSORBOARD_LOG_DIR, - "CLUSTER_SPEC": _TEST_CLUSTER_SPEC, - "TF_CONFIG": _TEST_CLUSTER_SPEC, - } - with mock.patch.dict(os.environ, env_vars): - yield - - @pytest.mark.usefixtures("mock_environment") - def test_training_data_uri(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.training_data_uri == _TEST_TRAINING_DATA_URI - - def test_training_data_uri_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.training_data_uri is None - - @pytest.mark.usefixtures("mock_environment") - def test_validation_data_uri(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.validation_data_uri == _TEST_VALIDATION_DATA_URI - - def test_validation_data_uri_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.validation_data_uri is None - - @pytest.mark.usefixtures("mock_environment") - def test_test_data_uri(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.test_data_uri == _TEST_TEST_DATA_URI - - def test_test_data_uri_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.test_data_uri is None - - @pytest.mark.usefixtures("mock_environment") - def test_model_dir(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.model_dir == _TEST_MODEL_DIR - - def test_model_dir_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.model_dir is None - - @pytest.mark.usefixtures("mock_environment") - def test_checkpoint_dir(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.checkpoint_dir == _TEST_CHECKPOINT_DIR - - def test_checkpoint_dir_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.checkpoint_dir is None - - @pytest.mark.usefixtures("mock_environment") - def test_tensorboard_log_dir(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.tensorboard_log_dir == _TEST_TENSORBOARD_LOG_DIR - - def test_tensorboard_log_dir_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.tensorboard_log_dir is None - - @pytest.mark.usefixtures("mock_environment") - def test_cluster_spec(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.cluster_spec == json.loads(_TEST_CLUSTER_SPEC) - - def test_cluster_spec_none(self): - env_vars = 
training_utils.EnvironmentVariables() - assert env_vars.cluster_spec is None - - @pytest.mark.usefixtures("mock_environment") - def test_tf_config(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.tf_config == json.loads(_TEST_CLUSTER_SPEC) - - def test_tf_config_none(self): - env_vars = training_utils.EnvironmentVariables() - assert env_vars.tf_config is None