The most effective way is to use a CAPTCHA-solving service together with residential proxies, which is fast and reliable.
If you don't want to work out how to solve CAPTCHAs or which proxies to use, you can try the Google Scholar API from SerpApi, a paid API with a free plan that bypasses blocks on the backend.
Code and example in the online IDE to scrape publications from all available pages with the ability to save results to CSV:
import pandas as pd
import os, json
from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl
def serpapi_scrape_all_publications(query: str, csv_path: "str | None" = None):
    """Scrape Google Scholar organic results for *query* from every result page.

    Requests are sent through SerpApi's ``google_scholar`` engine, following
    the ``serpapi_pagination.next`` link until no further page is reported.
    The collected publications are printed as pretty-printed JSON.

    Args:
        query: Search query passed to Google Scholar.
        csv_path: Optional path of a CSV file to write the results to.
            When ``None`` (the default) nothing is written to disk.

    Returns:
        A list of dicts, one per publication, with the keys ``title``,
        ``link``, ``result_id``, ``snippet``, ``inline_links`` and
        ``publication_info``. Fields missing from a result are ``None``.
    """
    # Fields copied from every organic result, in output order.
    fields = (
        "title",
        "link",
        "result_id",
        "snippet",
        "inline_links",
        "publication_info",
    )

    params = {
        "api_key": os.getenv("API_KEY"),  # your SerpApi API key
        "engine": "google_scholar",       # search engine
        "hl": "en",                       # language
        "q": query,                       # search query
        "num": "20",                      # articles per page (the engine caps this at 20)
    }

    # The actual data extraction happens on the SerpApi backend.
    search = GoogleScholarSearch(params)

    publications = []
    while True:
        results = search.get_dict()  # JSON -> Python dictionary

        publications.extend(
            {field: publication.get(field) for field in fields}
            for publication in results.get("organic_results", [])
        )

        # Stop as soon as the response no longer advertises a next page.
        next_page_url = results.get("serpapi_pagination", {}).get("next")
        if not next_page_url:
            break

        # Turn the next-page URL's query string into a dict and merge it into
        # the search parameters so the following request fetches that page.
        search.params_dict.update(dict(parse_qsl(urlsplit(next_page_url).query)))

    print(json.dumps(publications, indent=2, ensure_ascii=False))

    if csv_path is not None:
        pd.DataFrame(data=publications).to_csv(csv_path, encoding="utf-8", index=False)

    return publications
if __name__ == "__main__":
    # Run the example query only when executed as a script, so that importing
    # this module does not trigger API requests.
    serpapi_scrape_all_publications(query="biology")
Outputs:
[
{
"title": "Fungal decomposition of wood: its biology and ecology",
"link": null,
"result_id": "LiWKgtH72owJ",
"snippet": "",
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=LiWKgtH72owJ",
"cited_by": {
"total": 1446,
"link": "https://scholar.google.com/scholar?cites=10149701587489662254&as_sdt=400005&sciodt=0,14&hl=en&num=20",
"cites_id": "10149701587489662254",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=400005&cites=10149701587489662254&engine=google_scholar&hl=en&num=20"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:LiWKgtH72owJ:scholar.google.com/&scioq=biology&hl=en&num=20&as_sdt=0,14",
"versions": {
"total": 6,
"link": "https://scholar.google.com/scholar?cluster=10149701587489662254&hl=en&num=20&as_sdt=0,14",
"cluster_id": "10149701587489662254",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C14&cluster=10149701587489662254&engine=google_scholar&hl=en&num=20"
}
},
"publication_info": {
"summary": "ADM Rayner, L Boddy - 1988"
}
}, ... other results
]
Disclaimer: I work for SerpApi.