I want to extract selected text from an image of a PDF page using pytesseract.
I downloaded the Tesseract installer from Tesseract at UB Mannheim, then installed it and ran it as admin.
Then I imported the package with import pytesseract in Visual Studio Code.
However, when I ran:
# Minimal OCR script: open an image with Pillow, pass it to pytesseract,
# and print the text that comes back.
from PIL import Image
import pytesseract
file = Image.open(r"C:\Users\foo\Downloads\image1.png")
# image_to_string hands the image to the external tesseract program (the
# traceback below shows it going through subprocess.Popen), so the call
# fails if that program cannot be located.
text = pytesseract.image_to_string(file, lang='eng')
print(text)
it raised this error:
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
254 try:
--> 255 proc = subprocess.Popen(cmd_args, **subprocess_args())
256 except OSError as e:
C:\Python38\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
853
--> 854 self._execute_child(args, executable, preexec_fn, close_fds,
855 pass_fds, cwd, env,
C:\Python38\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1306 try:
-> 1307 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1308 # no special security
FileNotFoundError: [WinError 2] The system cannot find the file specified
During handling of the above exception, another exception occurred:
TesseractNotFoundError Traceback (most recent call last)
<ipython-input-7-c8ebb3307de6> in <module>
3 # # values = Image.open( )
4 file = Image.open(r"C:\Users\Harper.Guo\Downloads\image011.png")
----> 5 text = pytesseract.image_to_string(file, lang='eng')
6
7 print(text)
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in image_to_string(image, lang, config, nice, output_type, timeout)
407 args = [image, 'txt', lang, config, nice, timeout]
408
--> 409 return {
410 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
411 Output.DICT: lambda: {'text': run_and_get_output(*args)},
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in <lambda>()
410 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
411 Output.DICT: lambda: {'text': run_and_get_output(*args)},
--> 412 Output.STRING: lambda: run_and_get_output(*args),
413 }[output_type]()
414
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_and_get_output(image, extension, lang, config, nice, timeout, return_bytes)
285 }
286
--> 287 run_tesseract(**kwargs)
288 filename = kwargs['output_filename_base'] + extsep + extension
289 with open(filename, 'rb') as output_file:
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
257 if e.errno != ENOENT:
258 raise e
--> 259 raise TesseractNotFoundError()
260
261 with timeout_manager(proc, timeout) as error_string:
TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.
Update:
Following this thread, I tried:
# Second attempt: same OCR script, but with the tesseract location set
# explicitly instead of relying on PATH.
from PIL import Image
import pytesseract
# NOTE(review): this value is a folder, not a program. tesseract_cmd is
# presumably meant to be the full path of the tesseract executable itself
# (e.g. ending in \tesseract.exe) -- confirm against the pytesseract README.
# Trying to launch a directory would explain the "Access is denied" error.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR'
file = Image.open(r'C:\Users\User\Downloads\image1.png')
text = pytesseract.image_to_string(file, lang='eng')
print(text)
which raised this error:
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
<ipython-input-15-5431637f37bf> in <module>
3 pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR'
4 file = Image.open(r'C:\Users\User\Downloads\image011.png')
----> 5 text = pytesseract.image_to_string(file, lang='eng')
6
7 print(text)
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in image_to_string(image, lang, config, nice, output_type, timeout)
407 args = [image, 'txt', lang, config, nice, timeout]
408
--> 409 return {
410 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
411 Output.DICT: lambda: {'text': run_and_get_output(*args)},
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in <lambda>()
410 Output.BYTES: lambda: run_and_get_output(*(args + [True])),
411 Output.DICT: lambda: {'text': run_and_get_output(*args)},
--> 412 Output.STRING: lambda: run_and_get_output(*args),
413 }[output_type]()
414
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_and_get_output(image, extension, lang, config, nice, timeout, return_bytes)
285 }
286
--> 287 run_tesseract(**kwargs)
288 filename = kwargs['output_filename_base'] + extsep + extension
289 with open(filename, 'rb') as output_file:
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
256 except OSError as e:
257 if e.errno != ENOENT:
--> 258 raise e
259 raise TesseractNotFoundError()
260
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
253
254 try:
--> 255 proc = subprocess.Popen(cmd_args, **subprocess_args())
256 except OSError as e:
257 if e.errno != ENOENT:
C:\Python38\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
852 encoding=encoding, errors=errors)
853
--> 854 self._execute_child(args, executable, preexec_fn, close_fds,
855 pass_fds, cwd, env,
856 startupinfo, creationflags, shell,
C:\Python38\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
1305 # Start the process
1306 try:
-> 1307 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
1308 # no special security
1309 None, None,
PermissionError: [WinError 5] Access is denied