Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion bigframes/ml/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,7 @@ def predict(
max_output_tokens: int = 8192,
top_k: int = 40,
top_p: float = 1.0,
ground_with_google_search: bool = False,
) -> bpd.DataFrame:
"""Predict the result from input DataFrame.

Expand All @@ -936,11 +937,20 @@ def predict(
Specify a lower value for less random responses and a higher value for more random responses.
Default 40. Possible values [1, 40].

top_p (float, default 0.95)::
top_p (float, default 1.0):
Top-P changes how the model selects tokens for output. Tokens are selected from the most (see top-K) to least probable until the sum of their probabilities equals the top-P value. For example, if tokens A, B, and C have a probability of 0.3, 0.2, and 0.1 and the top-P value is 0.5, then the model will select either A or B as the next token by using temperature and excludes C as a candidate.
Specify a lower value for less random responses and a higher value for more random responses.
Default 1.0. Possible values [0.0, 1.0].

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the Vertex AI model. When set
to True, the model incorporates relevant information from Google Search
results into its responses, enhancing their accuracy and factualness.
This feature provides an additional column, `ml_generate_text_grounding_result`,
in the response output, detailing the sources used for grounding.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values.
Expand Down Expand Up @@ -974,6 +984,7 @@ def predict(
"top_k": top_k,
"top_p": top_p,
"flatten_json_output": True,
"ground_with_google_search": ground_with_google_search,
}

df = self._bqml_model.generate_text(X, options)
Expand Down
152 changes: 132 additions & 20 deletions bigframes/operations/semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import re
import typing
from typing import List, Optional
import warnings

import numpy as np

Expand All @@ -39,6 +40,7 @@ def agg(
model,
cluster_column: typing.Optional[str] = None,
max_agg_rows: int = 10,
ground_with_google_search: bool = False,
):
"""
Performs an aggregation over all rows of the table.
Expand Down Expand Up @@ -90,6 +92,14 @@ def agg(
max_agg_rows (int, default 10):
The maximum number of rows to be aggregated at a time.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: A new DataFrame with the aggregated answers.

Expand Down Expand Up @@ -119,6 +129,12 @@ def agg(
)
column = columns[0]

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

if max_agg_rows <= 1:
raise ValueError(
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
Expand Down Expand Up @@ -191,7 +207,12 @@ def agg(

# Run model
predict_df = typing.cast(
bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
bigframes.dataframe.DataFrame,
model.predict(
prompt_s,
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)
agg_df[column] = predict_df["ml_generate_text_llm_result"].combine_first(
single_row_df
Expand Down Expand Up @@ -284,7 +305,7 @@ def cluster_by(
df[output_column] = clustered_result["CENTROID_ID"]
return df

def filter(self, instruction: str, model):
def filter(self, instruction: str, model, ground_with_google_search: bool = False):
"""
Filters the DataFrame with the semantics of the user instruction.

Expand All @@ -305,18 +326,26 @@ def filter(self, instruction: str, model):
[1 rows x 2 columns]

Args:
instruction:
instruction (str):
An instruction on how to filter the data. This value must contain
column references by name, which should be wrapped in a pair of braces.
For example, if you have a column "food", you can refer to this column
in the instructions like:
"The {food} is healthy."

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
DataFrame filtered by the instruction.
bigframes.pandas.DataFrame: DataFrame filtered by the instruction.

Raises:
NotImplementedError: when the semantic operator experiment is off.
Expand All @@ -332,6 +361,12 @@ def filter(self, instruction: str, model):
if column not in self._df.columns:
raise ValueError(f"Column {column} not found.")

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand All @@ -345,14 +380,21 @@ def filter(self, instruction: str, model):
model.predict(
self._make_prompt(df, columns, user_instruction, output_instruction),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)

return self._df[
results["ml_generate_text_llm_result"].str.lower().str.contains("true")
]

def map(self, instruction: str, output_column: str, model):
def map(
self,
instruction: str,
output_column: str,
model,
ground_with_google_search: bool = False,
):
"""
Maps the DataFrame with the semantics of the user instruction.

Expand All @@ -376,21 +418,29 @@ def map(self, instruction: str, output_column: str, model):
[2 rows x 3 columns]

Args:
instruction:
instruction (str):
An instruction on how to map the data. This value must contain
column references by name, which should be wrapped in a pair of braces.
For example, if you have a column "food", you can refer to this column
in the instructions like:
"Get the ingredients of {food}."

output_column:
output_column (str):
The column name of the mapping result.

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
DataFrame with attached mapping results.
bigframes.pandas.DataFrame: DataFrame with attached mapping results.

Raises:
NotImplementedError: when the semantic operator experiment is off.
Expand All @@ -406,6 +456,12 @@ def map(self, instruction: str, output_column: str, model):
if column not in self._df.columns:
raise ValueError(f"Column {column} not found.")

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand All @@ -421,14 +477,22 @@ def map(self, instruction: str, output_column: str, model):
model.predict(
self._make_prompt(df, columns, user_instruction, output_instruction),
temperature=0.0,
ground_with_google_search=ground_with_google_search,
)["ml_generate_text_llm_result"],
)

from bigframes.core.reshape import concat

return concat([self._df, results.rename(output_column)], axis=1)

def join(self, other, instruction: str, model, max_rows: int = 1000):
def join(
self,
other,
instruction: str,
model,
max_rows: int = 1000,
ground_with_google_search: bool = False,
):
"""
Joins two dataframes by applying the instruction over each pair of rows from
the left and right table.
Expand All @@ -455,10 +519,10 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
[4 rows x 2 columns]

Args:
other:
other (bigframes.pandas.DataFrame):
The other dataframe.

instruction:
instruction (str):
An instruction on how left and right rows can be joined. This value must contain
column references by name. which should be wrapped in a pair of braces.
For example: "The {city} belongs to the {country}".
Expand All @@ -467,22 +531,36 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):
self joins. For example: "The {left.employee_name} reports to {right.employee_name}"
For unique column names, this prefix is optional.

model:
model (bigframes.ml.llm.GeminiTextGenerator):
A GeminiTextGenerator provided by Bigframes ML package.

max_rows:
max_rows (int, default 1000):
The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
call will end early with an error.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
The joined dataframe.
bigframes.pandas.DataFrame: The joined dataframe.

Raises:
ValueError if the amount of data that will be sent for LLM processing is larger than max_rows.
"""
self._validate_model(model)
columns = self._parse_columns(instruction)

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

joined_table_rows = len(self._df) * len(other)

if joined_table_rows > max_rows:
Expand Down Expand Up @@ -545,7 +623,9 @@ def join(self, other, instruction: str, model, max_rows: int = 1000):

joined_df = self._df.merge(other, how="cross", suffixes=("_left", "_right"))

return joined_df.semantics.filter(instruction, model).reset_index(drop=True)
return joined_df.semantics.filter(
instruction, model, ground_with_google_search=ground_with_google_search
).reset_index(drop=True)

def search(
self,
Expand Down Expand Up @@ -644,7 +724,13 @@ def search(

return typing.cast(bigframes.dataframe.DataFrame, search_result)

def top_k(self, instruction: str, model, k=10):
def top_k(
self,
instruction: str,
model,
k: int = 10,
ground_with_google_search: bool = False,
):
"""
Ranks each tuple and returns the k best according to the instruction.

Expand Down Expand Up @@ -682,6 +768,14 @@ def top_k(self, instruction: str, model, k=10):
k (int, default 10):
The number of rows to return.

ground_with_google_search (bool, default False):
Enables Grounding with Google Search for the GeminiTextGenerator model.
When set to True, the model incorporates relevant information from Google
Search results into its responses, enhancing their accuracy and factualness.
Note: Using this feature may impact billing costs. Refer to the pricing
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
The default is `False`.

Returns:
bigframes.dataframe.DataFrame: A new DataFrame with the top k rows.

Expand All @@ -703,6 +797,12 @@ def top_k(self, instruction: str, model, k=10):
"Semantic aggregations are limited to a single column."
)

if ground_with_google_search:
warnings.warn(
"Enabling Grounding with Google Search may impact billing cost. See pricing "
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
column = columns[0]
if df[column].dtype != dtypes.STRING_DTYPE:
Expand Down Expand Up @@ -743,6 +843,7 @@ def top_k(self, instruction: str, model, k=10):
user_instruction,
model,
k - num_selected,
ground_with_google_search,
)
num_selected += num_new_selected

Expand All @@ -757,7 +858,13 @@ def top_k(self, instruction: str, model, k=10):

@staticmethod
def _topk_partition(
df, column: str, status_column: str, user_instruction: str, model, k
df,
column: str,
status_column: str,
user_instruction: str,
model,
k: int,
ground_with_google_search: bool,
):
output_instruction = (
"Given a question and two documents, choose the document that best answers "
Expand All @@ -784,7 +891,12 @@ def _topk_partition(
import bigframes.dataframe

predict_df = typing.cast(
bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0)
bigframes.dataframe.DataFrame,
model.predict(
prompt_s,
temperature=0.0,
ground_with_google_search=ground_with_google_search,
),
)

marks = predict_df["ml_generate_text_llm_result"].str.contains("2")
Expand Down
Loading