Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion bigframes/ml/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,7 @@ def predict(
max_output_tokens: int = 8192,
top_k: int = 40,
top_p: float = 1.0,
ground_with_google_search: bool = False,
) -> bpd.DataFrame:
"""Predict the result from input DataFrame.

Expand All @@ -936,11 +937,20 @@ def predict(
Specify a lower value for less random responses and a higher value for more random responses.
Default 40. Possible values [1, 40].

top_p (float, default 0.95)::
top_p (float, default 1.0):
Top-P changes how the model selects tokens for output. Tokens are selected from the most (see top-K) to least probable until the sum of their probabilities equals the top-P value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-P value is 0.5, then the model will select either A or B as the next token by using temperature and excludes C as a candidate.
Specify a lower value for less random responses and a higher value for more random responses.
Default 1.0. Possible values [0.0, 1.0].

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the Vertex AI model. When set
to True, the model incorporates relevant information from Google Search
results into its responses, enhancing their accuracy and factualness.
This feature provides an additional column, `ml_generate_text_grounding_result`,
in the response output, detailing the sources used for grounding.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values.
Expand Down Expand Up @@ -974,6 +984,7 @@ def predict(
"top_k": top_k,
"top_p": top_p,
"flatten_json_output": True,
"ground_with_google_search": ground_with_google_search,
}

df = self._bqml_model.generate_text(X, options)
Expand Down
152 changes: 132 additions & 20 deletions bigframes/operations/semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import re
import typing
from typing import List, Optional
import warnings

import numpy as np

Expand All @@ -39,6 +40,7 @@ def agg(
model,
cluster_column: typing.Optional[str] = None,
max_agg_rows: int = 10,
ground_with_google_search: bool = False,
):
"""
Performs an aggregation over all rows of the table.
Expand Down Expand Up @@ -90,6 +92,14 @@ def agg(
max_agg_rows (int, default 10):
The maximum number of rows to be aggregated at a time.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: A new DataFrame with the aggregated answers.

Expand Down Expand Up @@ -119,6 +129,12 @@ def agg(
)
column = columns[0]

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

if max_agg_rows <= 1:
raise ValueError(
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
Expand Down Expand Up @@ -191,7 +207,12 @@ def agg(

# Run model
predict_df = typing.cast(
bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
bigframes.dataframe.DataFrame,
model.predict(
prompt_s,
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)
agg_df[column] = predict_df["ml_generate_text_llm_result"].combine_first(
single_row_df
Expand Down Expand Up @@ -284,7 +305,7 @@ def cluster_by(
df[output_column] = clustered_result["CENTROID_ID"]
return df

def filter(self, instruction: str, model):
def filter(self, instruction: str, model, ground_with_google_search: bool = False):
"""
Filters the DataFrame with the semantics of the user instruction.

Expand All @@ -305,18 +326,26 @@ def filter(self, instruction: str, model):
[1 rows x 2 columns]

Args:
instruction:
instruction (str):
An instruction on how to filter the data. This value must contain
column references by name, which should be wrapped in a pair of braces.
For example, if you have a column "food", you can refer to this column
in the instructions like:
"The {food} is healthy."

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
DataFrame filtered by the instruction.
bigframes.pandas.DataFrame: DataFrame filtered by the instruction.

Raises:
NotImplementedError: when the semantic operator experiment is off.
Expand All @@ -332,6 +361,12 @@ def filter(self, instruction: str, model):
if column not in self._df.columns:
raise ValueError(f"Column {column} not found.")

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand All @@ -345,14 +380,21 @@ def filter(self, instruction: str, model):
model.predict(
self._make_prompt(df, columns, user_instruction, output_instruction),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)

return self._df[
results["ml_generate_text_llm_result"].str.lower().str.contains("true")
]

def map(self, instruction: str, output_column: str, model):
def map(
self,
instruction: str,
output_column: str,
model,
ground_with_google_search: bool = False,
):
"""
Maps the DataFrame with the semantics of the user instruction.

Expand All @@ -376,21 +418,29 @@ def map(self, instruction: str, output_column: str, model):
[2 rows x 3 columns]

Args:
instruction:
instruction (str):
An instruction on how to map the data. This value must contain
column references by name, which should be wrapped in a pair of braces.
For example, if you have a column "food", you can refer to this column
in the instructions like:
"Get the ingredients of {food}."

output_column:
output_column (str):
The column name of the mapping result.

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
DataFrame with attached mapping results.
bigframes.pandas.DataFrame: DataFrame with attached mapping results.

Raises:
NotImplementedError: when the semantic operator experiment is off.
Expand All @@ -406,6 +456,12 @@ def map(self, instruction: str, output_column: str, model):
if column not in self._df.columns:
raise ValueError(f"Column {column} not found.")

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand All @@ -421,14 +477,22 @@ def map(self, instruction: str, output_column: str, model):
model.predict(
self._make_prompt(df, columns, user_instruction, output_instruction),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
)["ml_generate_text_llm_result"],
)

from bigframes.core.reshape import concat

return concat([self._df, results.rename(output_column)], axis=1)

def join(self, other, instruction: str, model, max_rows: int = 1000):
def join(
self,
other,
instruction: str,
model,
max_rows: int = 1000,
ground_with_google_search: bool = False,
):
"""
Joins two dataframes by applying the instruction over each pair of rows from
the left and right table.
Expand All @@ -455,10 +519,10 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
[4 rows x 2 columns]

Args:
other:
other (bigframes.pandas.DataFrame):
The other dataframe.

instruction:
instruction (str):
An instruction on how left and right rows can be joined. This value must contain
column references by name. which should be wrapped in a pair of braces.
For example: "The {city} belongs to the {country}".
Expand All @@ -467,22 +531,36 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
self joins. For example: "The {left.employee_name} reports to {right.employee_name}"
For unique column names, this prefix is optional.

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

max_rows:
max_rows (int, default 1000):
The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
call will end early with an error.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
The joined dataframe.
bigframes.pandas.DataFrame: The joined dataframe.

Raises:
ValueError if the amount of data that will be sent for LLM processing is larger than max_rows.
"""
self._validate_model(model)
columns = self._parse_columns(instruction)

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

joined_table_rows = len(self._df) * len(other)

if joined_table_rows > max_rows:
Expand Down Expand Up @@ -545,7 +623,9 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):

joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))

return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
return joined_df.semantics.filter(
instruction, model, ground_with_google_search=ground_with_google_search
).reset_index(drop=True)

def search(
self,
Expand Down Expand Up @@ -644,7 +724,13 @@ def search(

return typing.cast(bigframes.dataframe.DataFrame, search_result)

def top_k(self, instruction: str, model, k=10):
def top_k(
self,
instruction: str,
model,
k: int = 10,
ground_with_google_search: bool = False,
):
"""
Ranks each tuple and returns the k best according to the instruction.

Expand Down Expand Up @@ -682,6 +768,14 @@ def top_k(self, instruction: str, model, k=10):
k (int, default 10):
The number of rows to return.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: A new DataFrame with the top k rows.

Expand All @@ -703,6 +797,12 @@ def top_k(self, instruction: str, model, k=10):
"Semantic aggregations are limited to a single column."
)

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
column = columns[0]
if df[column].dtype != dtypes.STRING_DTYPE:
Expand Down Expand Up @@ -743,6 +843,7 @@ def top_k(self, instruction: str, model, k=10):
user_instruction,
model,
k - num_selected,
ground_with_google_search,
)
num_selected += num_new_selected

Expand All @@ -757,7 +858,13 @@ def top_k(self, instruction: str, model, k=10):

@staticmethod
def _topk_partition(
df, column: str, status_column: str, user_instruction: str, model, k
df,
column: str,
status_column: str,
user_instruction: str,
model,
k: int,
ground_with_google_search: bool,
):
output_instruction = (
"Given a question and two documents, choose the document that best answers "
Expand All @@ -784,7 +891,12 @@ def _topk_partition(
import bigframes.dataframe

predict_df = typing.cast(
bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
bigframes.dataframe.DataFrame,
model.predict(
prompt_s,
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)

marks = predict_df["ml_generate_text_llm_result"].str.contains("2")
Expand Down
Loading