OpenAI API - EasyOCR with OpenAI API and Unsplash API Side Project
Research Motivation
接續 connect OpenAI API with MATLAB Program Side Project 與 Fine-Tune OpenAI Model Side Project,並結合目前正在研究的畢業專題「基於深度學習之中文書法的辨識與生成」所產出的延伸自主研究
Introduction
利用 Unsplash API 下載取得指定類型之開源圖片 dataset
接著透過 OpenAI API model 生成 image to image 的 variation 變化圖,或是直接透過 OpenAI API model 依指定的 prompt 生成圖片 dataset
最後使用 EasyOCR 進行 text detection 提取圖像中的文字,預期以此撰寫一套簡易的文字辨識系統
架構
- 利用 Unsplash API 下載 Unsplash dataset 中指定類型的圖片(例如:車牌號碼)
- 透過 OpenAI API 將從 Unsplash 下載的圖生成 variation 變化圖
- 透過 OpenAI API prompt 生成 dataset 圖片(另種生成 dataset 的方式)
- 使用 EasyOCR 進行圖片中可辨識之文字提取
Information
1. 利用 Unsplash API 下載 Unsplash dataset 中指定類型的圖片
- 先於 Unsplash 註冊帳號成為 Developer,接著進入 Application 頁面創建自己的 App,即可獲得 Access Key
- 我撰寫了 `UnsplashDownloadImage.py`,給予 keyword 及張數來下載圖片

import requests
import json
import os
# Unsplash endpoint root and credentials. The access key is taken from the
# ACCESS_KEY environment variable and is None when that variable is unset.
base_url = "https://api.unsplash.com/"
access_key = os.environ.get("ACCESS_KEY")
def download_image(url, file_path, output):
    """Download ``url`` and save the response body as ``output``/``file_path``.

    Args:
        url: Address of the image to fetch.
        file_path: Name of the file to create inside ``output``.
        output: Existing directory that receives the file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=30)
    # Fail loudly instead of writing an error page to disk as an image file.
    response.raise_for_status()
    with open(os.path.join(output, file_path), 'wb') as file:
        file.write(response.content)
def search_and_download(query, count, output):
    """Search Unsplash for ``query`` and save up to ``count`` photos.

    Each hit's ``regular`` rendition is written to ``output`` as
    ``image_0.jpg``, ``image_1.jpg``, ... in the order the API returns them.
    Fewer than ``count`` files are written when the search runs out of hits.

    Args:
        query: Search keyword sent to the API.
        count: Number of images wanted.
        output: Existing directory that receives the files.

    Raises:
        RuntimeError: If the ACCESS_KEY environment variable was not set.
        requests.HTTPError: If a search or image request is rejected.
    """
    if not access_key:
        # Without this check the header concatenation below fails with an
        # unhelpful "can only concatenate str" TypeError.
        raise RuntimeError('ACCESS_KEY environment variable is not set')

    search_url = base_url + 'search/photos'
    headers = {'Authorization': 'Client-ID ' + access_key}
    # Unsplash limits how many results one page may hold (30 per its API
    # docs), so larger requests are served by walking several pages. The page
    # size must stay constant across pages or the page offsets would shift.
    per_page = min(count, 30)
    saved = 0
    page = 1
    while saved < count:
        params = {'query': query, 'page': page, 'per_page': per_page}
        response = requests.get(
            search_url, headers=headers, params=params, timeout=30
        )
        response.raise_for_status()
        results = response.json()['results']
        if not results:
            break  # the search has no further hits
        for photo in results[:count - saved]:
            download_image(photo['urls']['regular'], f'image_{saved}.jpg', output)
            saved += 1
        page += 1
def main():
    """Prompt for a keyword and an image count, then download the images.

    The files go into a folder named after the keyword inside the current
    working directory; the folder is created when it does not exist yet.

    Raises:
        ValueError: If the entered image count is not an integer.
    """
    query = input("Enter your search query: ")
    count = int(input("Enter the number of images to download: "))
    # Leading separators are dropped so the folder always stays below the
    # working directory, as it did with the original string concatenation.
    output = os.path.join(os.getcwd(), query.lstrip('/\\'))
    # exist_ok avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs(output, exist_ok=True)
    search_and_download(query, count, output)


if __name__ == '__main__':
    main()
2. 透過 OpenAI API 將從 Unsplash 下載的圖生成 variation 變化圖
- 我撰寫了 `ImageToImageWithOpenAI.py`,將方才從 Unsplash API 下載的圖藉由 OpenAI API model 訓練並生成相似變化圖
  - 因為 OpenAI API 只吃 `.png`,而從 Unsplash API 下載的圖片是 `.jpg`,所以要先轉檔
  - 最後輸出為生成之 variation 圖片的 URL 連結,並將其下載於指定本機資料夾中

import openai
import os
import requests
from PIL import Image
# The OpenAI credential is read from the OPENAI_API_KEY environment variable
# (None when the variable is unset).
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Ask OpenAI for variations of an existing image.
def generate_image_variation(image_path, n, size):
    """Request ``n`` variations of the image stored at ``image_path``.

    ``size`` is forwarded unchanged as the requested output size. Only the URL
    of the first returned variation is handed back, even when ``n`` is greater
    than one.
    """
    with open(image_path, "rb") as source:
        result = openai.Image.create_variation(image=source, n=n, size=size)
    first_variation = result['data'][0]
    return first_variation['url']
# Convert the JPEG downloaded from Unsplash into a PNG first, because the
# OpenAI API only accepts PNG images.
jpeg_image_path = "image_250.jpg"
# Derive the PNG name from the JPEG name so the two can never drift apart.
png_image_path = os.path.splitext(jpeg_image_path)[0] + ".png"
with Image.open(jpeg_image_path) as jpeg_image:
    jpeg_image.save(png_image_path, "PNG")

# Image to transform: the PNG that was just written.
image_path = png_image_path
num_variations = 1
image_size = "1024x1024"

# Generate the variation and obtain its URL.
generated_image_url = generate_image_variation(image_path, num_variations, image_size)
print("Generated image URL:", generated_image_url)

# Download the picture behind the returned URL.
response = requests.get(generated_image_url, timeout=60)
# Stop here on an HTTP error instead of saving the error body as a PNG.
response.raise_for_status()
image_filename = "generated_image.png"
with open(image_filename, "wb") as file:
    file.write(response.content)
3. 透過 OpenAI API prompt 生成圖片
- 先利用命令生成圖片模型:
  `openai api image.create -p "license plate from an image with many cars"`
  - 目前嘗試的結果:如果沒有先執行這行指令,執行 `create.py` 都會噴錯誤
- 參考 Generate Images With DALL·E 2 and the OpenAI API 文章後撰寫 `create.py`,執行後會產出含生成圖片的 `.json` 檔並顯示圖片 URL
  - `create_v1.py` 及 `create_v2.py` 皆為直接輸出圖片 URL,而 `create_v3.py` 會將圖片自動下載於 `responses` 資料夾中

import json
import os
from pathlib import Path
import openai
# Text prompt the image is generated from.
PROMPT = "license plates with car"

openai.api_key = os.getenv("OPENAI_API_KEY")

# Raw API responses are stored under ./responses; create it when missing.
data_dir = Path.cwd() / "responses"
data_dir.mkdir(exist_ok=True)

# Request one 256x256 image, returned inside the response as "b64_json".
request_options = {
    "prompt": PROMPT,
    "n": 1,
    "size": "256x256",
    "response_format": "b64_json",
}
response = openai.Image.create(**request_options)

# File name: first five prompt characters plus the response's "created" value.
file_name = data_dir / f"{PROMPT[:5]}-{response['created']}.json"
with file_name.open(mode="w", encoding="utf-8") as file:
    json.dump(response, file)
- 利用 `convert.py` 將上一步生成的 `.json` 檔轉換為 `.png` 圖片

import json
from base64 import b64decode
from pathlib import Path
def convert_response_to_png(json_file, image_dir):
    """Decode every base64 image stored in ``json_file`` into a PNG file.

    Args:
        json_file: Path of a saved image response whose ``"data"`` list holds
            dicts carrying a ``"b64_json"`` entry.
        image_dir: Directory that receives the PNG files; it is created,
            including missing parents, when it does not exist.

    Returns:
        The written files in response order, each named
        ``<json file stem>-<index>.png``.

    Raises:
        KeyError: If the response has no ``"data"`` list or an entry lacks
            ``"b64_json"``.
    """
    json_file = Path(json_file)
    image_dir = Path(image_dir)
    image_dir.mkdir(parents=True, exist_ok=True)

    with open(json_file, mode="r", encoding="utf-8") as file:
        response = json.load(file)

    written = []
    for index, image_dict in enumerate(response["data"]):
        image_file = image_dir / f"{json_file.stem}-{index}.png"
        image_file.write_bytes(b64decode(image_dict["b64_json"]))
        written.append(image_file)
    return written


if __name__ == "__main__":
    # Same input and output locations the script has always used.
    data_dir = Path.cwd() / "responses"
    json_file = data_dir / "licen-1684206176.json"
    image_dir = Path.cwd() / "images" / json_file.stem
    convert_response_to_png(json_file, image_dir)
4. 使用 easyOCR 進行圖片中可辨識之文字提取
- 參考 EasyOCR tutorial 文章後修改了 `easyOCR.py`
- 可將方才利用 OpenAI 生成之圖片 `generated_image.png` 中的文字進行偵測辨識讀取

import os
import cv2
import openai
import easyocr
import matplotlib.pyplot as plt
from torch.cuda import is_available
from dotenv import load_dotenv
class Reader:
    """Wrapper around ``easyocr.Reader`` configured for English text.

    Calling an instance is shorthand for :meth:`extract_text`.
    """

    def __init__(self, is_cuda=False):
        """Create the underlying EasyOCR reader.

        Args:
            is_cuda: Forwarded to EasyOCR as its ``gpu`` argument.
        """
        self.reader = easyocr.Reader(
            ['en'],
            gpu=is_cuda,
            model_storage_directory=os.path.join('models'),
            download_enabled=True,
        )

    def __call__(self, img, **kwargs):
        """Run :meth:`extract_text` on ``img``, forwarding keyword options."""
        return self.extract_text(img, **kwargs)

    def extract_text(self, img, show_text=False, show_confidence=False,
                     min_confidence=.45):
        """Detect text in ``img`` and outline every accepted detection.

        Args:
            img: Image handed to EasyOCR's ``readtext`` and drawn on.
            show_text: Write the recognized text next to its box.
            show_confidence: Write the confidence value next to its box.
            min_confidence: Detections whose confidence is not strictly
                greater than this value are ignored.

        Returns:
            A tuple ``(texts, img)``: the accepted text strings in detection
            order and the image with the boxes (and optional labels) drawn.
        """
        result = self.reader.readtext(img)
        extracted_text = []
        for box, acc_text, confidence in result:
            if not confidence > min_confidence:
                continue
            # box[0] and box[2] are the upper-left and lower-right corners;
            # the drawing calls need integer coordinates.
            top_left = tuple(int(i) for i in box[0])
            bottom_right = tuple(int(i) for i in box[2])
            img = cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

            if show_text and show_confidence:
                img_text = f'{acc_text} - ({confidence:.3f}%)'
            elif show_text:
                img_text = acc_text
            elif show_confidence:
                img_text = f'CONF: ({confidence:.3f}%)'

            if show_text or show_confidence:
                img = cv2.putText(
                    img,
                    img_text,
                    (int(box[0][0]), int(box[0][1] - 3)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=.5,
                    color=(168, 90, 50),
                    thickness=2,
                )
            extracted_text.append(acc_text)
        return extracted_text, img
class GPT_3:
    """Helper that sends prompts to ``openai.Completion``.

    Calling an instance is the same as calling :meth:`prediction`.
    """

    def __init__(self, api_key):
        """Store the default request options and set the OpenAI API key."""
        # Request settings used whenever a call supplies no options.
        self.options = dict(
            engine='text-davinci-002',
            temperature=0.25,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            max_tokens=512,
        )
        self.completion = openai.Completion
        openai.api_key = api_key

    def __call__(self, prompt, options=None):
        return self.prediction(prompt, options)

    def prediction(self, prompt, options=None):
        """Return the text of the first completion choice for ``prompt``.

        ``options`` replaces the default request options as a whole; a missing
        or empty value selects the defaults.
        """
        request_options = options if options else self.options
        reply = self.completion.create(prompt=prompt, **request_options)
        return reply['choices'][0]['text']

    def summarize(self, text):
        """Ask for a summary of ``text`` using the default options."""
        return self.prediction(
            prompt=f'Try to summarize the following text as best you can!\n\n{text}'
        )
def read_img(img_path):
    """Load the image at ``img_path`` and return it converted from BGR to RGB.

    Raises:
        FileNotFoundError: If OpenCV could not read an image from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals a missing or unreadable file by returning None
        # instead of raising; without this check the conversion below fails
        # with an error that never mentions the path.
        raise FileNotFoundError(f'Could not read image: {img_path}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
if __name__ == '__main__':
    load_dotenv()  # Load secrets

    reader = Reader(is_cuda=is_available())
    gpt_3 = GPT_3(os.getenv('OPENAI_API_KEY'))

    # Run OCR on the generated image and join the detected pieces.
    img = read_img('./generated_image.png')
    text, extracted_image = reader(img)
    text = ' '.join(text)

    print('Extracted_text')
    print(text)

    if text:
        summarization_result = gpt_3.summarize(text)
        print('Summarized text:')
        print(summarization_result)
    else:
        # Nothing was recognized, so there is nothing to summarize; skip the
        # API request rather than sending an empty prompt body.
        print('No text detected; skipping summarization.')

    # Show the image with the detection boxes drawn on it.
    plt.imshow(extracted_image)
    plt.show()