1234567891011121314151617181920212223242526272829303132333435363738394041
# OCR every JPEG found under a directory with Tesseract (Traditional Chinese
# model), then word-segment a fixed window of the recognised lines with jieba
# and print both the raw line and its space-separated segmentation.
import os
import jieba
import pytesseract
import pandas as pd
from PIL import Image
from pathlib import Path

# Location of the Tesseract executable that pytesseract shells out to.
pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# Lazily yields every .jpg below the root directory, at any depth.
pathlist = Path("<User pytesseract Path>").glob('**/*.jpg')

# Not used in the code visible here; kept because later parts of the script
# may rely on these module-level names.
prevPrompt = ''
prevAns = ''
df = pd.DataFrame()

for fontPath in pathlist:
    # File name without directory or extension, e.g. "scan01" for ".../scan01.jpg".
    basename = fontPath.stem
    print('basename: ' + basename)

    # Open the globbed path itself. Rebuilding the name as f'{basename}.jpg'
    # resolved it against the current working directory, which fails (or picks
    # up the wrong file) for any image that does not live in the CWD.
    # The context manager releases the file handle once OCR is done.
    with Image.open(fontPath) as img:
        text = pytesseract.image_to_string(img, lang='chi_tra')

    # Drop every space from the OCR output before splitting it into lines.
    text = text.replace(' ', '')
    text = text.split('\n')

    # Only the OCR lines at indices 5..14 are segmented; everything before
    # and after that window is ignored.
    for item in text[5:15]:
        if not item:
            # Skip blank lines produced by the OCR output.
            continue
        print(item)
        # jieba.cut yields the segmented tokens; join them with single spaces.
        sentence = jieba.cut(item)
        sentence = ' '.join(sentence)
        print(sentence)