OpenAI API - EasyOCR with OpenAI API and Unsplash API Side Project

LAVI

Research Motivation

This is a follow-up to the connect OpenAI API with MATLAB Program Side Project and the Fine-Tune OpenAI Model Side Project, extended into a self-directed study that ties in with my current graduation project, 「基於深度學習之中文書法的辨識與生成」 (Recognition and Generation of Chinese Calligraphy Based on Deep Learning).

Introduction

Use the Unsplash API to download an open-source image dataset of a specified type.
Then use an OpenAI API model to generate image-to-image variation images from those photos, or generate a dataset of a specified image type directly from a prompt via the OpenAI API.
Finally, use EasyOCR for text detection to extract the text in the images. The goal is a simple end-to-end text recognition pipeline.

LAVI's EasyOCR with OpenAI API and Unsplash API Side Project

Architecture

  1. Use the Unsplash API to download images of a specified type (e.g., license plates) from the Unsplash dataset
  2. Use the OpenAI API to generate variation images from the images downloaded from Unsplash
  3. Use OpenAI API prompts to generate dataset images (an alternative way to build the dataset)
  4. Use EasyOCR to extract the recognizable text from the images

Information

1. Use the Unsplash API to download images of a specified type from the Unsplash dataset

  • First register an Unsplash account, join as a Developer, go to the Applications page, create your own App, and obtain an Access Key
  • I wrote UnsplashDownloadImage.py, which downloads images given a keyword and a number of images (a paging variant is sketched after the script below)
    import requests
    import json
    import os

    # Unsplash Access Key is read from the environment
    access_key = os.getenv('ACCESS_KEY')
    base_url = 'https://api.unsplash.com/'

    # Download a single image URL into the output folder
    def download_image(url, file_path, output):
        response = requests.get(url)
        with open(output + '/' + file_path, 'wb') as file:
            file.write(response.content)

    # Search Unsplash for `query` and download `count` results
    def search_and_download(query, count, output):
        search_url = base_url + 'search/photos'
        headers = {'Authorization': 'Client-ID ' + access_key}
        params = {'query': query, 'per_page': count}
        response = requests.get(search_url, headers=headers, params=params)
        data = json.loads(response.text)

        for i, photo in enumerate(data['results']):
            image_url = photo['urls']['regular']
            file_path = f'image_{i}.jpg'
            download_image(image_url, file_path, output)

    def main():
        query = input("Enter your search query: ")
        count = int(input("Enter the number of images to download: "))

        # Save the images into a folder named after the query
        cur_dir = os.getcwd()
        output = cur_dir + f'/{query}'
        if not os.path.exists(output):
            os.mkdir(output)

        search_and_download(query, count, output)

    if __name__ == '__main__':
        main()
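
The search call in UnsplashDownloadImage.py fetches only a single page of results, and Unsplash caps per_page at a fairly small number, so very large counts come back truncated. As a minimal sketch of one way to extend it (my own assumption about a useful extension, not part of the original script), the same search/photos endpoint can be walked with its page parameter:

    import os
    import requests

    access_key = os.getenv('ACCESS_KEY')

    def search_paged(query, total, per_page=30):
        # Collect up to `total` image URLs by walking the paginated search endpoint
        urls, page = [], 1
        while len(urls) < total:
            response = requests.get(
                'https://api.unsplash.com/search/photos',
                headers={'Authorization': 'Client-ID ' + access_key},
                params={'query': query, 'per_page': per_page, 'page': page},
            )
            results = response.json()['results']
            if not results:  # no more results for this query
                break
            urls += [photo['urls']['regular'] for photo in results]
            page += 1
        return urls[:total]

The returned URLs can then be passed to the existing download_image helper.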

2. Use the OpenAI API to generate variation images from the images downloaded from Unsplash

  • I wrote ImageToImageWithOpenAI.py, which sends an image downloaded via the Unsplash API to the OpenAI image variation endpoint and generates a similar variation image (a sketch for keeping multiple variations follows the script below)
    • Because the OpenAI API only accepts .png files and the images downloaded from the Unsplash API are .jpg, the image has to be converted first
    • The script outputs the URL of the generated variation image and downloads it into the specified local folder
    import openai
    import os
    import requests
    from PIL import Image

    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Use OpenAI to generate a variation image of the given image
    def generate_image_variation(image_path, n, size):
        with open(image_path, "rb") as image_file:
            response = openai.Image.create_variation(
                image=image_file,
                n=n,
                size=size
            )
        image_url = response['data'][0]['url']
        return image_url

    # Convert the .jpg image downloaded from Unsplash into a .png image,
    # because the OpenAI API only accepts .png images
    jpeg_image_path = "image_250.jpg"
    jpeg_image = Image.open(jpeg_image_path)
    png_image_path = "image_250.png"
    jpeg_image.save(png_image_path, "PNG")

    # The image to generate a variation from
    image_path = "image_250.png"
    num_variations = 1
    image_size = "1024x1024"

    # Generate the variation image and get its URL
    generated_image_url = generate_image_variation(image_path, num_variations, image_size)
    print("Generated image URL:", generated_image_url)

    # Download the image from the returned URL
    response = requests.get(generated_image_url)
    image_filename = "generated_image.png"
    with open(image_filename, "wb") as file:
        file.write(response.content)
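
Note that generate_image_variation returns only the first URL even when n > 1. If you want to keep every variation the endpoint returns, a small sketch along these lines (assuming the same openai.Image.create_variation call used above) collects all of them:

    import openai
    import requests

    def generate_image_variations(image_path, n, size):
        # Return the URLs of all n generated variations, not just the first one
        with open(image_path, "rb") as image_file:
            response = openai.Image.create_variation(image=image_file, n=n, size=size)
        return [item['url'] for item in response['data']]

    # Download each variation under its own filename
    for i, url in enumerate(generate_image_variations("image_250.png", 3, "1024x1024")):
        with open(f"generated_image_{i}.png", "wb") as file:
            file.write(requests.get(url).content)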

3. Generate images from a prompt with the OpenAI API

  • First generate an image from the command line: openai api image.create -p "license plate from an image with many cars"
    • In my testing so far, running create.py without running this command first always throws an error
  • After reading the article Generate Images With DALL·E 2 and the OpenAI API, I wrote create.py; running it produces a .json file containing the generated image and prints the image URL
    • create_v1.py and create_v2.py both just print the image URL, while create_v3.py automatically downloads the image into the responses folder (a URL-only sketch follows the script below)
    import json
    import os
    from pathlib import Path

    import openai

    PROMPT = "license plates with car"
    data_dir = Path.cwd() / "responses"

    data_dir.mkdir(exist_ok=True)

    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.Image.create(
        prompt=PROMPT,
        n=1,
        size="256x256",
        response_format="b64_json",
    )

    file_name = data_dir / f"{PROMPT[:5]}-{response['created']}.json"

    with open(file_name, mode="w", encoding="utf-8") as file:
        json.dump(response, file)
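
For comparison with the v1/v2 behaviour described above, here is a minimal sketch of a URL-only variant (my own reconstruction, not necessarily identical to create_v1.py or create_v2.py; it assumes response_format="url", which the Image endpoint also supports):

    import os
    import openai

    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Ask for a URL instead of base64 data; nothing is written to disk here
    response = openai.Image.create(
        prompt="license plates with car",
        n=1,
        size="256x256",
        response_format="url",
    )

    print(response["data"][0]["url"])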
  • Use convert.py to convert the .json file generated in the previous step into .png images (a variant that walks every file in responses/ is sketched after the script)
    import json
    from base64 import b64decode
    from pathlib import Path

    data_dir = Path.cwd() / "responses"
    json_file = data_dir / "licen-1684206176.json"
    image_dir = Path.cwd() / "images" / json_file.stem

    image_dir.mkdir(parents=True, exist_ok=True)

    with open(json_file, mode="r", encoding="utf-8") as file:
        response = json.load(file)

    for index, image_dict in enumerate(response["data"]):
        image_data = b64decode(image_dict["b64_json"])
        image_file = image_dir / f"{json_file.stem}-{index}.png"
        with open(image_file, mode="wb") as png:
            png.write(image_data)
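
convert.py hardcodes one specific .json filename. If several responses accumulate, a small variation of the same loop (a sketch, assuming the same responses/ and images/ layout) can walk every JSON file in the folder instead:

    import json
    from base64 import b64decode
    from pathlib import Path

    data_dir = Path.cwd() / "responses"

    # Decode every saved response, one output folder per JSON file
    for json_file in data_dir.glob("*.json"):
        image_dir = Path.cwd() / "images" / json_file.stem
        image_dir.mkdir(parents=True, exist_ok=True)

        with open(json_file, mode="r", encoding="utf-8") as file:
            response = json.load(file)

        for index, image_dict in enumerate(response["data"]):
            image_file = image_dir / f"{json_file.stem}-{index}.png"
            image_file.write_bytes(b64decode(image_dict["b64_json"]))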

4. Use EasyOCR to extract the recognizable text from the images

  • After reading the EasyOCR tutorial article, I adapted easyOCR.py (a character-allowlist sketch follows the script below)
  • It detects, recognizes, and reads the text in generated_image.png, the image generated with OpenAI above
    import os
    import cv2
    import openai
    import easyocr
    import matplotlib.pyplot as plt
    from torch.cuda import is_available
    from dotenv import load_dotenv

    class Reader:
        def __init__(self, is_cuda=False):
            self.reader = easyocr.Reader(['en'], gpu=is_cuda, model_storage_directory=os.path.join('models'), download_enabled=True)

        def __call__(self, img):
            return self.extract_text(img)

        def extract_text(self, img, show_text=False, show_confidence=False):
            result = self.reader.readtext(img)

            extracted_text = []

            # Keep only detections with confidence above 0.45
            for text in filter(lambda x: x[-1] > .45, result):
                box, acc_text, confidence = text

                # box[0] and box[2] - upper left and lower right corners of the box;
                # each coordinate is a list and has to be cast to int
                img = cv2.rectangle(img, [int(i) for i in box[0]], [int(i) for i in box[2]], (0, 255, 0), 2)

                if show_text and show_confidence:
                    img_text = f'{acc_text} - ({"{:.3f}".format(confidence)}%)'
                elif show_text:
                    img_text = acc_text
                elif show_confidence:
                    img_text = f'CONF: ({"{:.3f}".format(confidence)}%)'

                if show_text or show_confidence:
                    img = cv2.putText(
                        img,
                        img_text,
                        (int(box[0][0]), int(box[0][1] - 3)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=.5,
                        color=(168, 90, 50),
                        thickness=2
                    )

                extracted_text.append(acc_text)

            return extracted_text, img

    class GPT_3:
        def __init__(self, api_key):
            openai.api_key = api_key

            self.completion = openai.Completion
            self.options = {
                'engine': 'text-davinci-002',
                'temperature': 0.25,
                'top_p': 1,
                'frequency_penalty': 0,
                'presence_penalty': 0,
                'max_tokens': 512
            }

        def __call__(self, prompt, options=None):
            return self.prediction(prompt, options)

        def prediction(self, prompt, options=None):
            if not options:
                options = self.options

            return self.completion.create(prompt=prompt, **options)['choices'][0]['text']

        def summarize(self, text):
            prompt = f'Try to summarize the following text as best you can!\n\n{text}'

            return self.prediction(prompt=prompt)

    def read_img(img_path):
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        return img

    if __name__ == '__main__':
        load_dotenv()  # Load secrets

        reader = Reader(is_cuda=is_available())
        gpt_3 = GPT_3(os.getenv('OPENAI_API_KEY'))

        img = read_img('./generated_image.png')
        text, extracted_image = reader(img)

        text = ' '.join(text)

        print('Extracted text:')
        print(text)

        summarization_result = gpt_3.summarize(text)

        print('Summarized text:')
        print(summarization_result)

        plt.imshow(extracted_image)
        plt.show()
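
Since the target here is license plates, EasyOCR's readtext also accepts an allowlist parameter that restricts recognition to a given character set, which can reduce spurious matches. A short sketch of that option (my own addition, separate from easyOCR.py):

    import easyocr

    reader = easyocr.Reader(['en'], gpu=False)

    # detail=0 returns only the recognized strings; allowlist limits the characters
    # EasyOCR may output, which suits plate-style text
    plate_text = reader.readtext(
        './generated_image.png',
        detail=0,
        allowlist='0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-',
    )

    print(plate_text)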

Results

The first image below is the original, the second is the variation generated by the OpenAI model, and the third shows EasyOCR extracting the recognizable text from the image: each green box marks a region where text was successfully detected. In this example, the license plate text recognized is 01, 891, and 811.

Reference