I have code that launches Selenium in multithreaded mode:
# Run identifier.check_type once per item of `data` on a pool of 2 worker threads.
# NOTE(review): ThreadPool is presumably multiprocessing.pool.ThreadPool - confirm
# against the imports. As pasted, this call appears before the definitions it
# relies on, so it is presumably an excerpt from later in the script.
ThreadPool(2).map(identifier.check_type, data)
# Thread-local storage: get_driver() caches one `driver` attribute per thread on it.
threadLocal = threading.local()
def get_driver():
    """Return the Chrome WebDriver cached for the calling thread.

    On the first call from a given thread a new Chrome instance is started
    with a random user-agent, two scripts that hide ``navigator.webdriver``
    are registered for every new document, all cookies are deleted, and the
    driver is stored on ``threadLocal`` so later calls from the same thread
    reuse it.

    Returns:
        The thread's ``webdriver.Chrome`` instance.

    Raises:
        Exception: whatever Selenium raises while starting or configuring
            the browser. If configuration fails after the browser has
            started, the browser is quit before the error propagates.
    """
    driver = getattr(threadLocal, 'driver', None)
    if driver is not None:
        return driver

    chrome_options = Options()
    # A fresh random user-agent is chosen for each browser instance.
    chrome_options.add_argument(f'user-agent={UserAgent().random}')
    # `options=` is the supported keyword; the older `chrome_options=`
    # spelling was deprecated and later removed from Selenium.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Remove the `webdriver` property from navigator's prototype in
        # every document the browser loads from now on.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source":
                "const newProto = navigator.__proto__;"
                "delete newProto.webdriver;"
                "navigator.__proto__ = newProto;"
        })
        # Additionally make `navigator.webdriver` read as undefined.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": (
                "\n"
                "Object.defineProperty(navigator, 'webdriver', {\n"
                "get: () => undefined\n"
                "})\n"
            )
        })
        driver.delete_all_cookies()
    except Exception:
        # The driver is not cached yet, so nobody else could ever close it:
        # shut the browser down here instead of leaking the process.
        driver.quit()
        raise

    threadLocal.driver = driver
    return driver
class TypeIdentifier:
    """Loads a start URL in the calling thread's browser and runs the type-1 crawler on it."""

    def __init__(self):
        """Create an identifier; no per-instance state is set up."""
        # No driver is stored on the instance: check_type() asks get_driver()
        # for the current thread's browser each time it runs.

    def check_type(self, input_data):
        """Open ``input_data['start_url']`` and run ``type1_crawler.my_type`` on the page.

        Args:
            input_data: mapping that must contain a ``'start_url'`` key.

        Returns:
            None. The crawler's result is not returned.

        Raises:
            KeyError: if ``'start_url'`` is missing from ``input_data``.
            Exception: anything raised while obtaining the driver, loading
                the page, or crawling propagates to the caller unchanged.
        """
        # Fetch the driver first, as the original did, so a driver is created
        # even if the key lookup below fails.
        browser = get_driver()
        target = input_data['start_url']
        print(target)
        browser.get(target)
        type1_crawler.my_type(browser)
How do I keep track of the driver instances and manage them properly, so that I can close any instance or change the URL of any driver instance? Right now, whenever a URL fails to load or some kind of exception occurs while crawling, the thread just stops responding and does not move on to new URLs.