LLMExtractor
Bases: object
LLM Extractor
This is an extractor utility useful for leveraging the LLM's ability to automatically extract data from HTML. The data must be described through a pydantic model; this extractor uses that model to parse and validate the needed entities correctly. It uses an OpenAI LLM model.
Attributes:
Name | Description
---|---
`_bot` | (`Bot`) The bot instance associated with the extractor.
`_pydantic_model` | (`BaseModel`) The representation of the data needed to extract and validate the parsed data.
Methods:
Name | Description
---|---
`__init__` | `__init__(self, bot: Bot, pydantic_model: BaseModel)`: Initializes the LLMExtractor class.
`extract_data` | `extract_data(self, locator_name: str) -> str`: Extracts the needed data.
Source code in fastbots/llm_extractor.py
class LLMExtractor(object):
    """
    LLM Extractor

    Extractor utility that leverages an LLM's ability to automatically extract
    data from HTML.  The data to extract must be described through a pydantic
    model; the extractor uses that model to parse and validate the needed
    entities.  It uses an OpenAI LLM model.

    Attributes:
        _bot (Bot): The bot instance associated with the extractor.
        _llm_chain (LLMChain): The prompt + LLM pipeline used for extraction.

    Methods:
        __init__(self, bot: Bot, pydantic_model: BaseModel): Initializes the LLMExtractor class.
        extract_data(self, locator_name: str) -> str: Extracts the needed data.
    """

    def __init__(self, bot: Bot, pydantic_model: BaseModel) -> None:
        """
        Initializes the LLMExtractor class.

        Args:
            bot (Bot): The bot instance associated with the extractor.
            pydantic_model (BaseModel): The representation of the data needed
                to extract and validate the parsed data.
        """
        super().__init__()
        self._bot: Bot = bot

        # temperature=0 keeps the extraction output as deterministic as possible.
        llm_model = ChatOpenAI(
            temperature=0,
            model="gpt-3.5-turbo",
            openai_api_key=config.OPENAI_API_KEY
        )

        prompt_template = """ given this information {information} of an entity on this piece of html,
I want you to extract all the information about this entity.
You are not allowed to make any assumptions while extracting the information.
Every link you provide should be from the information given.
There should be no assumptions for Links/URLS.
You should not return code to do it.:
You should extract the following text infromation from the html:
\n{format_instructions} # here we are passing format_instructions
"""

        # BUG FIX: JsonOutputParser takes the keyword `pydantic_object`, not
        # `pydantic_model` — with the wrong keyword the schema was never used
        # to build the format instructions.
        json_output_parser = JsonOutputParser(
            pydantic_object=pydantic_model
        )

        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=["information"],
            partial_variables={"format_instructions": json_output_parser.get_format_instructions()},
        )

        self._llm_chain = LLMChain(llm=llm_model, prompt=prompt)

    def __locator__(self, locator_name: str) -> tuple:
        """
        Utility method to load a locator.

        The locators in the file must be in the format:
        [llm_extractor]
        locator_name=(By.XPATH, "//html//input")

        Args:
            locator_name (str): The name of the locator.

        Returns:
            tuple: A tuple representing the loaded locator.

        Raises:
            ValueError: If the locator is not enclosed in round brackets or is
                of an unknown or incorrect format.
        """
        # Load the locator string from the locators file, unescaping quotes.
        full_locator: str = self._bot.locator('llm_extractor', locator_name).strip().replace('\\\'', '\'').replace('\\"', '"')

        if not full_locator.startswith('(') or not full_locator.endswith(')'):
            raise ValueError('The locator must be enclosed in round brackets.')

        # Contents of the tuple, without the surrounding round brackets.
        inner: str = full_locator[1:-1].strip()

        # Whitelist of the supported locator strategies.
        locator_list: List[str] = [
            'By.ID', 'By.XPATH', 'By.NAME', 'By.CLASS_NAME', 'By.CSS_SELECTOR',
            'By.LINK_TEXT', 'By.PARTIAL_LINK_TEXT', 'By.TAG_NAME'
        ]

        for locator in locator_list:
            # The strategy name must be followed (after optional spaces) by a comma.
            if inner.startswith(locator) and inner[len(locator):].strip().startswith(','):
                # Resolve the strategy via getattr on the By whitelist entry
                # instead of eval(); identical result, no code evaluation.
                parsed_locator: tuple = (
                    getattr(By, locator.split('.', 1)[1]),
                    inner[len(locator):].strip()[1:].strip()[1:-1]
                )
                logging.debug(f'{locator_name} {parsed_locator}')
                return parsed_locator

        # BUG FIX: raise only after ALL strategies were tried — the original
        # raised inside the loop body, so any strategy other than 'By.ID'
        # (the first list entry) was rejected immediately.
        raise ValueError('The specified locator is unknown or wrong; check by, brackets, and commas.')

    def extract_data(self, locator_name: str) -> str:
        """
        Extract the data as a JSON string, validated through the data format
        specified by the pydantic model.

        The locators in the file must be in the format:
        [llm_extractor]
        locator_name=(By.XPATH, "//html//input")

        Args:
            locator_name (str): The name of the locator.

        Returns:
            str: The LLM chain's text output, or None when the extraction
                fails for any reason (the error is logged, not propagated).
        """
        try:
            # Wait for the target element, then feed its inner HTML to the chain.
            element = self._bot.wait.until(
                EC.presence_of_element_located(self.__locator__(locator_name))
            )
            extracted_data = self._llm_chain.invoke(
                input={"information": element.get_attribute('innerHTML')},
                return_only_outputs=True,
            )
            return extracted_data["text"]
        except Exception as e:
            # Best-effort extraction: log the failure and return None.
            logging.error(e)
            return None
__init__(bot, pydantic_model)
Initializes the LLMExtractor class.
Parameters:
Name | Type | Description
---|---|---
`bot` | `Bot` | The bot instance associated with the extractor.
`pydantic_model` | `BaseModel` | The representation of the data needed to extract and validate the parsed data.
Source code in fastbots/llm_extractor.py
def __init__(self, bot: Bot, pydantic_model: BaseModel) -> None:
    """
    Initializes the LLMExtractor class.

    Args:
        bot (Bot): The bot instance associated with the extractor.
        pydantic_model (BaseModel): The representation of the data needed
            to extract and validate the parsed data.
    """
    super().__init__()
    self._bot: Bot = bot

    # temperature=0 keeps the extraction output as deterministic as possible.
    llm_model = ChatOpenAI(
        temperature=0,
        model="gpt-3.5-turbo",
        openai_api_key=config.OPENAI_API_KEY
    )

    prompt_template = """ given this information {information} of an entity on this piece of html,
I want you to extract all the information about this entity.
You are not allowed to make any assumptions while extracting the information.
Every link you provide should be from the information given.
There should be no assumptions for Links/URLS.
You should not return code to do it.:
You should extract the following text infromation from the html:
\n{format_instructions} # here we are passing format_instructions
"""

    # BUG FIX: JsonOutputParser takes the keyword `pydantic_object`, not
    # `pydantic_model` — with the wrong keyword the schema was never used
    # to build the format instructions.
    json_output_parser = JsonOutputParser(
        pydantic_object=pydantic_model
    )

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["information"],
        partial_variables={"format_instructions": json_output_parser.get_format_instructions()},
    )

    self._llm_chain = LLMChain(llm=llm_model, prompt=prompt)
__locator__(locator_name)
Utility method to load a locator.
The locators in the file must be in the format: [llm_extractor] locator_name=(By.XPATH, "//html//input")
Parameters:
Name | Type | Description
---|---|---
`locator_name` | `str` | The name of the locator.

Returns:
Type | Description
---|---
`tuple` | A tuple representing the loaded locator.

Raises:
Type | Description
---|---
`ValueError` | If the locator is not enclosed in round brackets or is of an unknown or incorrect format.
Source code in fastbots/llm_extractor.py
def __locator__(self, locator_name: str) -> tuple:
    """
    Utility method to load a locator.

    The locators in the file must be in the format:
    [llm_extractor]
    locator_name=(By.XPATH, "//html//input")

    Args:
        locator_name (str): The name of the locator.

    Returns:
        tuple: A tuple representing the loaded locator.

    Raises:
        ValueError: If the locator is not enclosed in round brackets or is
            of an unknown or incorrect format.
    """
    # Load the locator string from the locators file, unescaping quotes.
    full_locator: str = self._bot.locator('llm_extractor', locator_name).strip().replace('\\\'', '\'').replace('\\"', '"')

    if not full_locator.startswith('(') or not full_locator.endswith(')'):
        raise ValueError('The locator must be enclosed in round brackets.')

    # Contents of the tuple, without the surrounding round brackets.
    inner: str = full_locator[1:-1].strip()

    # Whitelist of the supported locator strategies.
    locator_list: List[str] = [
        'By.ID', 'By.XPATH', 'By.NAME', 'By.CLASS_NAME', 'By.CSS_SELECTOR',
        'By.LINK_TEXT', 'By.PARTIAL_LINK_TEXT', 'By.TAG_NAME'
    ]

    for locator in locator_list:
        # The strategy name must be followed (after optional spaces) by a comma.
        if inner.startswith(locator) and inner[len(locator):].strip().startswith(','):
            # Resolve the strategy via getattr on the By whitelist entry
            # instead of eval(); identical result, no code evaluation.
            parsed_locator: tuple = (
                getattr(By, locator.split('.', 1)[1]),
                inner[len(locator):].strip()[1:].strip()[1:-1]
            )
            logging.debug(f'{locator_name} {parsed_locator}')
            return parsed_locator

    # BUG FIX: raise only after ALL strategies were tried — the original
    # raised inside the loop body, so any strategy other than 'By.ID'
    # (the first list entry) was rejected immediately.
    raise ValueError('The specified locator is unknown or wrong; check by, brackets, and commas.')
extract_data(locator_name)
Extract the data as a JSON string, validated through the data format specified by the pydantic model.
The locators in the file must be in the format: [llm_extractor] locator_name=(By.XPATH, "//html//input")
Parameters:
Name | Type | Description
---|---|---
`locator_name` | `str` | The name of the locator.
Source code in fastbots/llm_extractor.py
def extract_data(self, locator_name: str) -> str:
    """
    Extract the data as a JSON string, validated through the data format
    specified by the pydantic model.

    The locators in the file must be in the format:
    [llm_extractor]
    locator_name=(By.XPATH, "//html//input")

    Args:
        locator_name (str): The name of the locator.
    """
    try:
        # Resolve the locator, wait for the element, then hand its inner
        # HTML to the LLM chain for extraction.
        target = self.__locator__(locator_name)
        element = self._bot.wait.until(EC.presence_of_element_located(target))
        html_fragment = element.get_attribute('innerHTML')
        result = self._llm_chain.invoke(
            input={"information": html_fragment},
            return_only_outputs=True,
        )
        return result["text"]
    except Exception as error:
        # Any failure is logged and reported to the caller as None.
        logging.error(error)
        return None