Theoretical methods

Method

Our project operates as follows:

1. Extract a keyword from the user's input query:

def find_keyword(openai : any, question : str) -> str:
    """Ask the chat model for a single keyword describing ``question``.

    The ``openai`` argument is accepted but not used in the body; the request
    is sent through the module-level ``client``.

    Returns the message content of the first choice in the completion.
    """
    # The system message sets the assistant's behavior (optional).
    system_message = {"role": "system", "content": "A system that extracts only one keyword from a given question."}

    # The user message carries the request the assistant should respond to:
    # the instruction, the question itself, and two worked examples.
    user_prompt = f"""
        #######prompt########
        Generate only one keyword from the given question.
        question : {question}

        ####example####
        question : '손흥민의 최근 뉴스들을 알려줘'
        answer : '손흥민'

        question : '미국의 경제 상황에 대해서 알려줘'
        answer : '미국경제'

        ####output####
        answer :

         """
    user_message = {"role": "user", "content": user_prompt}

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

We wrote a prompt that uses the GPT API to extract a single keyword from the user's query.

2. Embedding function

The following helper returns the embedding vector for a given text:

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by the embedding ``model``.

    Newlines in ``text`` are replaced with spaces before the request is sent
    through the module-level ``client``. The embedding of the first (and only)
    input is returned.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model)
    first_item = result.data[0]
    return first_item.embedding

3. Fetch articles for the keyword and embed them:

  • Use the NAVER Article API to fetch articles for the keyword, then download each article and extract its title and body text.

  • Use the embedding function to embed each article's title and body text.

def make_article(response_body):
  """Download the articles linked in a search response and embed them.

  Args:
    response_body: UTF-8 encoded bytes of the article-search API response.
      Every ``"link":"https..."`` value found in it is treated as an
      article URL.

  Returns:
    A DataFrame with the columns ``title``, ``text``, ``title_embedding``
    and ``text_embedding``, one row per link in the order the links appear
    (index 0..n-1). When no link is found the frame is empty but still has
    these columns.
  """
  columns = ['title', 'text', 'title_embedding', 'text_embedding']

  # Raw data received from the article API.
  payload = response_body.decode('utf-8')

  # Extract only the article links with a regular expression.
  pattern = r'"link":"(https[^"]+)"'
  # Remove backslash characters from the links (presumably JSON-escaped
  # slashes -- confirm against the API response format).
  links = [link.replace("\\", "") for link in re.findall(pattern, payload)]

  # Download and parse every article first.
  parsed = []
  for link in links:
    article = Article(link, language='ko')
    article.download()
    article.parse()
    parsed.append((article.title, article.text))

  # Embed titles and bodies, then build the frame in one go. Building the
  # rows up front avoids chained assignment (df[col][i] = value), which
  # pandas does not guarantee to write through to the frame.
  rows = [
      {
          'title': title,
          'text': body,
          'title_embedding': get_embedding(title),
          'text_embedding': get_embedding(body),
      }
      for title, body in parsed
  ]
  return pd.DataFrame(rows, columns=columns)

4. Use cosine similarity to select articles:

Select the 5 most similar articles. Similarity is measured as the cosine similarity between the embedding of the query and the embedding of each article's title.

# search function
def strings_ranked_by_relatedness(
    query: str,  # query text to rank against
    df: pd.DataFrame,  # rows need "title", "text" and "title_embedding"
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),  # cosine similarity
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of ``df`` by relatedness of their title to ``query``.

    Args:
        query: Text that is embedded with ``get_embedding`` and compared
            against each row's ``title_embedding``.
        df: Frame with the columns ``title``, ``text`` and
            ``title_embedding``.
        relatedness_fn: Callable taking (query embedding, row embedding) and
            returning a score; higher means more related.
        top_n: Maximum number of rows to return.

    Returns:
        Three tuples of equal length -- titles, texts and relatedness scores --
        sorted from most related to least and truncated to ``top_n``. All
        three are empty when ``df`` has no rows.
    """
    # Nothing to rank: return early instead of failing on the zip() unpack
    # below (and skip the embedding request).
    if len(df) == 0:
        return (), (), ()

    query_embedding = get_embedding(query)
    scored = [
        (row["title"], row["text"], relatedness_fn(query_embedding, row["title_embedding"]))
        for _, row in df.iterrows()
    ]
    # Sort on the score (index 2). Index 1 is the article text, which would
    # order the results alphabetically instead of by relatedness.
    scored.sort(key=lambda item: item[2], reverse=True)

    title_strings, text_strings, relatednesses = zip(*scored)
    return title_strings[:top_n], text_strings[:top_n], relatednesses[:top_n]
def collect_articles(keyword : str, df : pd.DataFrame, top_n: int = 100):
  """Return the texts of the articles in ``df`` most related to ``keyword``.

  Args:
    keyword: Query passed to ``strings_ranked_by_relatedness``.
    df: Frame of articles to rank (passed through unchanged).
    top_n: Maximum number of article texts to return.

  Returns:
    A list of article texts in the order returned by
    ``strings_ranked_by_relatedness``.
  """
  # Use the caller's keyword, frame and top_n rather than a fixed query, a
  # global frame and a fixed count.
  _titles, text_strings, _relatednesses = strings_ranked_by_relatedness(
      keyword, df, top_n=top_n
  )
  return list(text_strings)

5. Generate an answer from the collected articles:

Configure a function to generate answers based on the selected articles.

from openai import OpenAI
client = OpenAI()

def generate_data(openai : any, article: str, question : str) -> str:
    """Ask the chat model to answer ``question`` based on ``article``.

    Args:
        openai: Accepted but not used in the body; the request is sent
            through the module-level ``client``.
        article: Article text to answer from. A list (or other iterable) of
            article texts is also accepted and is joined with blank lines.
        question: The user's question.

    Returns:
        The message content of the first choice in the completion.
    """
    # generate_answer passes the list returned by collect_articles; join it so
    # the prompt contains the article text itself rather than a Python list
    # repr (brackets, quotes, escaped newlines). A plain string is left as is.
    if not isinstance(article, str):
        article = "\n\n".join(article)

    response = client.chat.completions.create(
      model="gpt-4-1106-preview",
      messages=[
          # The system message sets the assistant's behavior (optional).
        {"role": "system", "content": "It's a system that generates relevant answers based on news stories related to your question."},
          # The user message carries the request the assistant should respond to.
        {"role": "user", "content": f"""
        #######prompt########
        Read an {article} related to a given {question} and answer it appropriately.
         """},

      ]
    )
    return response.choices[0].message.content

6. Final function:

def generate_answer(question : str):
  """Run the whole pipeline for one question and return the generated answer.

  Steps: extract a keyword with ``find_keyword``, search articles with
  ``search_article``, download and embed them with ``make_article``, pick the
  five most related article texts with ``collect_articles``, and pass them
  with the question to ``generate_data``.
  """
  # Find the keyword for the question.
  search_keyword = find_keyword(openai, question)

  # Search for articles (NAVER API, per the original comment) and embed the
  # articles that were found.
  embedded_articles = make_article(search_article(search_keyword))

  # Find the articles most similar to the keyword.
  related_texts = collect_articles(search_keyword, embedded_articles, top_n=5)

  # Answer the question from the selected articles.
  return generate_data(openai, related_texts, question)

Comments

  Write a comment ...