Method
Our project operates as follows:
1. Extract a keyword from the user's input query:
from openai import OpenAI

def find_keyword(client: OpenAI, question: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            # The system message sets the assistant's behavior (optional)
            {"role": "system", "content": "A system that extracts only one keyword from a given question."},
            # The user message provides the request for the assistant to respond to
            {"role": "user", "content": f"""
            #######prompt########
            Generate only one keyword from the given question.
            question : {question}
            ####example####
            question : '손흥민의 최근 뉴스들을 알려줘'
            answer : '손흥민'
            question : '미국의 경제 상황에 대해서 알려줘'
            answer : '미국경제'
            ####output####
            answer :
            """},
        ]
    )
    return response.choices[0].message.content
We’ve written a few-shot prompt that leverages the GPT API to extract a single keyword from the user's query.
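For example (a minimal sketch; it assumes OPENAI_API_KEY is set in the environment), the few-shot examples in the prompt are Korean, e.g. '손흥민의 최근 뉴스들을 알려줘' ("Tell me the latest news about Son Heung-min"):

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
keyword = find_keyword(client, "손흥민의 최근 뉴스들을 알려줘")
print(keyword)  # expected: '손흥민' (Son Heung-min)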
2. Embedding function:
def get_embedding(text, model="text-embedding-ada-002"):
    # Newlines can hurt embedding quality, so replace them with spaces
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding
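For reference, text-embedding-ada-002 returns a 1536-dimensional vector; a quick check (a sketch, assuming the same client as above):

vec = get_embedding("손흥민 최근 경기")  # "Son Heung-min's recent matches"
print(len(vec))   # 1536 for text-embedding-ada-002
print(vec[:3])    # first few float components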
3. Fetch articles for the keyword and embed them:
- Utilize the NAVER Article API to fetch articles based on the keyword and extract their text (see the sketch after this list).
- Utilize the embedding function to embed each article's title and body.
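The search helper search_article, which the final function below relies on, is not shown in the post; a minimal sketch against the NAVER News Search API might look like the following, assuming credentials registered on the NAVER developer portal (NAVER_CLIENT_ID / NAVER_CLIENT_SECRET are placeholder names):

import urllib.request
import urllib.parse

NAVER_CLIENT_ID = "YOUR_CLIENT_ID"          # placeholder: issued on the NAVER developer portal
NAVER_CLIENT_SECRET = "YOUR_CLIENT_SECRET"  # placeholder

def search_article(keyword: str, display: int = 10) -> bytes:
    # NAVER News Search API: returns JSON containing article titles and links
    url = (
        "https://openapi.naver.com/v1/search/news.json?query="
        + urllib.parse.quote(keyword)
        + f"&display={display}"
    )
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", NAVER_CLIENT_ID)
    request.add_header("X-Naver-Client-Secret", NAVER_CLIENT_SECRET)
    with urllib.request.urlopen(request) as response:
        return response.read()  # raw bytes; decoded in make_article below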
import re
import pandas as pd
from newspaper import Article  # newspaper3k

def make_article(response_body):
    ## Raw data received from the article API ##
    text = response_body.decode('utf-8')
    # Regular-expression pattern for article links
    pattern = r'"link":"(https[^"]+)"'
    # Use the regex to extract only the article links
    links = re.findall(pattern, text)
    # Remove the escaped '\' characters
    links = [link.replace("\\", "") for link in links]
    ### Build a DataFrame ###
    articles = pd.DataFrame(columns=['title', 'text', 'title_embedding', 'text_embedding'])
    for i, link in enumerate(links):
        # Create an Article object with the URL (Korean language)
        article = Article(link, language='ko')
        # Download and parse the page
        article.download()
        article.parse()
        # Store the title and body; embeddings are filled in below
        articles.loc[i] = [article.title, article.text, [], []]
    ### Now the embeddings ###
    for i in range(len(articles)):
        articles.at[i, 'title_embedding'] = get_embedding(articles.at[i, 'title'])
        articles.at[i, 'text_embedding'] = get_embedding(articles.at[i, 'text'])
    return articles
4. Use cosine similarity to select articles:
Select the five most similar candidate articles. We calculate the similarity using the cosine similarity between the query embedding and each article's title embedding.
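As a quick sanity check of the metric (a minimal sketch; the vectors are toy values, not real embeddings), 1 - spatial.distance.cosine(x, y) equals the dot product of x and y divided by the product of their norms:

import numpy as np
from scipy import spatial

x = np.array([1.0, 2.0, 3.0])
y = np.array([2.0, 4.0, 6.0])  # same direction as x, so similarity is 1

# scipy's cosine() returns the *distance*, so similarity = 1 - distance
sim = 1 - spatial.distance.cosine(x, y)
manual = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
print(sim, manual)  # both ≈ 1.0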
# search function
from scipy import spatial

def strings_ranked_by_relatedness(
    query: str,                                                      # query string
    df: pd.DataFrame,                                                # article DataFrame
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),   # cosine similarity
    top_n: int = 100
) -> tuple[list[str], list[str], list[float]]:
    """Returns titles, texts, and relatednesses, sorted from most related to least."""
    query_embedding = get_embedding(query)   # embed the query
    strings_and_relatednesses = [             # compute similarity against each title embedding
        (row["title"], row["text"], relatedness_fn(query_embedding, row["title_embedding"]))
        for i, row in df.iterrows()
    ]
    # Sort by relatedness (the third tuple element), most related first
    strings_and_relatednesses.sort(key=lambda x: x[2], reverse=True)
    title_strings, text_strings, relatednesses = zip(*strings_and_relatednesses)
    return title_strings[:top_n], text_strings[:top_n], relatednesses[:top_n]
def collect_articles(keyword: str, df: pd.DataFrame, top_n: int = 100):
    total_articles = []
    title_strings, text_strings, relatednesses = strings_ranked_by_relatedness(keyword, df, top_n=top_n)
    for title, text, relatedness in zip(title_strings, text_strings, relatednesses):
        # print(f"{relatedness=:.3f}")  # uncomment to inspect the scores
        total_articles.append(text)
    return total_articles
5. Generate answers from the selected articles:
Configure a function to generate answers based on the selected articles.
from openai import OpenAI
client = OpenAI()

def generate_data(client: OpenAI, article: str, question: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            # The system message sets the assistant's behavior (optional)
            {"role": "system", "content": "It's a system that generates relevant answers based on news stories related to your question."},
            # The user message provides the request for the assistant to respond to
            {"role": "user", "content": f"""
            #######prompt########
            Read the article below, which is related to the given question, and answer the question appropriately.
            article : {article}
            question : {question}
            """},
        ]
    )
    return response.choices[0].message.content
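One detail worth noting: collect_articles returns a list of article bodies, while generate_data expects a single string. A minimal way to bridge the two (an assumption on our part; the original passes the list straight into the prompt) is to join the texts first:

articles_df = make_article(search_article("손흥민"))
selected = collect_articles("손흥민", articles_df, top_n=5)
context = "\n\n".join(selected)  # merge the selected article bodies into one context string
print(generate_data(client, context, "손흥민의 최근 뉴스들을 알려줘"))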
6. Final function:
def generate_answer(question: str):
    # Find the keyword for the question
    keyword = find_keyword(client, question)
    # Search for articles (NAVER API)
    response = search_article(keyword)
    # Scrape and embed the retrieved articles
    articles = make_article(response)
    # Find the articles most similar to the question
    most_related_articles = collect_articles(keyword, articles, top_n=5)
    # Answer the question from the selected articles (joined into one string, as noted above)
    answer = generate_data(client, "\n\n".join(most_related_articles), question)
    return answer
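End to end, a call might look like this (a sketch; it assumes the OpenAI key and NAVER credentials used above are configured):

if __name__ == "__main__":
    # "Tell me the latest news about Son Heung-min"
    print(generate_answer("손흥민의 최근 뉴스들을 알려줘"))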