0

I want to extract selected text from a PDF image using pytesseract.

I downloaded the Tesseract installer (the OCR engine that pytesseract wraps) from Tesseract at UB Mannheim, then installed it and ran it as administrator.

Then I imported the package with import pytesseract in Visual Studio Code.

However, when I ran:

from PIL import Image
import pytesseract
file = Image.open(r"C:\Users\foo\Downloads\image1.png")
text = pytesseract.image_to_string(file, lang='eng')
print(text)

it raised the following error:

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
    254     try:
--> 255         proc = subprocess.Popen(cmd_args, **subprocess_args())
    256     except OSError as e:

C:\Python38\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    853 
--> 854             self._execute_child(args, executable, preexec_fn, close_fds,
    855                                 pass_fds, cwd, env,

C:\Python38\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1306             try:
-> 1307                 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
   1308                                          # no special security

FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

TesseractNotFoundError                    Traceback (most recent call last)
<ipython-input-7-c8ebb3307de6> in <module>
      3 # # values  = Image.open( )
      4 file = Image.open(r"C:\Users\Harper.Guo\Downloads\image011.png")
----> 5 text = pytesseract.image_to_string(file, lang='eng')
      6 
      7 print(text)

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in image_to_string(image, lang, config, nice, output_type, timeout)
    407     args = [image, 'txt', lang, config, nice, timeout]
    408 
--> 409     return {
    410         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
    411         Output.DICT: lambda: {'text': run_and_get_output(*args)},

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in <lambda>()
    410         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
    411         Output.DICT: lambda: {'text': run_and_get_output(*args)},
--> 412         Output.STRING: lambda: run_and_get_output(*args),
    413     }[output_type]()
    414 

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_and_get_output(image, extension, lang, config, nice, timeout, return_bytes)
    285         }
    286 
--> 287         run_tesseract(**kwargs)
    288         filename = kwargs['output_filename_base'] + extsep + extension
    289         with open(filename, 'rb') as output_file:

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
    257         if e.errno != ENOENT:
    258             raise e
--> 259         raise TesseractNotFoundError()
    260 
    261     with timeout_manager(proc, timeout) as error_string:

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

Update:

Following this thread, I tried:

from PIL import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR' 

file = Image.open(r'C:\Users\User\Downloads\image1.png')
text = pytesseract.image_to_string(file, lang='eng')
print(text)

which raised a different error:

---------------------------------------------------------------------------
PermissionError                           Traceback (most recent call last)
<ipython-input-15-5431637f37bf> in <module>
      3 pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR'
      4 file = Image.open(r'C:\Users\User\Downloads\image011.png')
----> 5 text = pytesseract.image_to_string(file, lang='eng')
      6 
      7 print(text)

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in image_to_string(image, lang, config, nice, output_type, timeout)
    407     args = [image, 'txt', lang, config, nice, timeout]
    408 
--> 409     return {
    410         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
    411         Output.DICT: lambda: {'text': run_and_get_output(*args)},

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in <lambda>()
    410         Output.BYTES: lambda: run_and_get_output(*(args + [True])),
    411         Output.DICT: lambda: {'text': run_and_get_output(*args)},
--> 412         Output.STRING: lambda: run_and_get_output(*args),
    413     }[output_type]()
    414 

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_and_get_output(image, extension, lang, config, nice, timeout, return_bytes)
    285         }
    286 
--> 287         run_tesseract(**kwargs)
    288         filename = kwargs['output_filename_base'] + extsep + extension
    289         with open(filename, 'rb') as output_file:

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
    256     except OSError as e:
    257         if e.errno != ENOENT:
--> 258             raise e
    259         raise TesseractNotFoundError()
    260 

C:\Python38\lib\site-packages\pytesseract\pytesseract.py in run_tesseract(input_filename, output_filename_base, extension, lang, config, nice, timeout)
    253 
    254     try:
--> 255         proc = subprocess.Popen(cmd_args, **subprocess_args())
    256     except OSError as e:
    257         if e.errno != ENOENT:

C:\Python38\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    852                             encoding=encoding, errors=errors)
    853 
--> 854             self._execute_child(args, executable, preexec_fn, close_fds,
    855                                 pass_fds, cwd, env,
    856                                 startupinfo, creationflags, shell,

C:\Python38\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1305             # Start the process
   1306             try:
-> 1307                 hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
   1308                                          # no special security
   1309                                          None, None,

PermissionError: [WinError 5] Access is denied
nilsinelabore
  • 3,061
  • 10
  • 30
  • 83
  • 1
    Does this answer your question? [TesseractNotFoundError: tesseract is not installed or it's not in your path](https://stackoverflow.com/questions/51677283/tesseractnotfounderror-tesseract-is-not-installed-or-its-not-in-your-path) – Amit Gupta Jun 23 '21 at 04:50

0 Answers