from pathlib import Path
from time import perf_counter
from typing import Any
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
from requests.models import Response
from tinydb.database import TinyDB
from tinydb.queries import Query
from tinydb.table import Table
import re
import pandas as pd
import requests

# 10 Minerals web scrapping
#
# References:
#
# 10.1 Toolbox
#
# The following provides a workaround for warnings and certificate
# verification (the target site does not present a valid certificate):
from urllib3.exceptions import InsecureRequestWarning
import urllib3

urllib3.disable_warnings(InsecureRequestWarning)


def get(url: str, **kw) -> Response:
    """ Wrap requests.get without SSL certificate verification.

    NOTE(review): ``verify=False`` is deliberate here (broken upstream
    certificate); do not reuse this helper for hosts where verification
    matters.
    """
    return requests.get(url, verify=False, **kw)


# When True, re-scrape even if a local database file already exists.
FORCE = False

# 10.2 Data paths
# Root directory holding the scraped Thermoddem data files.
media = Path(".").resolve().parent / "data" / "thermoddem"
# Sanity check (notebook cell output was True):
media.exists()

# Raw scraped species payloads:
db_recovered = media / "sandbox-recovered.json"
# Post-processed tabular entries and references:
db_processed = media / "sandbox-processed.json"

# 10.3 Path finding
URL = "https://thermoddem.brgm.fr"
QUERY = "/data/minerals?page={n}"


def get_number_of_pages() -> int:
    """ Find number of pages in database.

    Reads the pager of the first listing page and extracts the page index
    from the "last page" link.

    Raises
    ------
    RuntimeError
        If no ``?page=N`` fragment can be found in the last-page href.
    """
    page = get(f"{URL}{QUERY.format(n=0)}")
    soup = BeautifulSoup(page.content, "html.parser")
    pager = soup.find("ul", class_="pager")
    last = pager.find("li", class_="pager-last")
    ref = last.find("a").attrs["href"]
    if not (match := re.search(r"\?page=(\d+)", ref)):
        # BUG FIX: original message lacked the f-prefix and reported the
        # literal text "{ref}" instead of the offending href.
        raise RuntimeError(f"Cannot extract page number from {ref}")
    return int(match.group(1))


def retrieve_navigation_table(page: Response) -> ResultSet:
    """ Retrieve rows of `<table class="views-table cols-4">` from page.

    BUG FIX: the original annotation said ``Tag`` but the function returns
    the ``find_all("tr")`` result set of the first table body.

    Raises
    ------
    RuntimeError
        If the HTTP status is not 200.
    """
    if page.status_code != 200:
        raise RuntimeError(f"Unable to retrieve data from {page.url}!")
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table", class_="views-table")
    if (n_tables := len(tables)) > 1:
        print(f"Unexpected, more than 1 ({n_tables}) table!")
    tbody = tables[0].find_all("tbody")
    if (n_bodies := len(tbody)) > 1:
        print(f"Unexpected, more than 1 ({n_bodies}) bodies!")
    return tbody[0].find_all("tr")


def extract_species_paths(rows: ResultSet) -> list[str]:
    """ Find link to all species in row set.

    The species link is taken from the last anchor of the first column of
    each row; rows failing to parse are reported and skipped (best-effort).
    """
    paths = []
    for i, row in enumerate(rows):
        try:
            col = row.find_all("td")[0]
            ref = col.find_all("a")[-1].attrs["href"]
            paths.append(ref)
        except Exception as err:
            print(f"While parsing row {i}: {err}")
    return paths


def retrieve_all_paths(verbose: bool = False) -> list[str]:
    """ Walk every listing page and collect all species link paths. """
    n_pages = get_number_of_pages()
    paths = []
    # Page indices are inclusive of the last page, hence n_pages + 1.
    for n in range(n_pages + 1):
        if verbose and not n % 2:
            print(f"Working on page {n:02}/{n_pages}")
        page = get(f"{URL}{QUERY.format(n=n)}")
        rows = retrieve_navigation_table(page)
        paths.extend(extract_species_paths(rows))
    return paths


# 10.4 Data recovery
def get_thermo_row(tag: Tag) -> list[str]:
    """ Extract text data from first row of table. """
    rows = tag.find("tbody").find_all("tr")
    if (n_rows := len(rows)) > 1:
        print(f"Unexpected, more than 1 ({n_rows}) rows!")
    cols = rows[0].find_all("td")
    data = [c.text.strip() for c in cols]
    return data


def get_thermo_species(species_path: str) -> dict[str, Any]:
    """ Retrieve data from a single species URL.

    Raises
    ------
    RuntimeError
        If the HTTP status is not 200.
    """
    page = get(f"{URL}{species_path}")
    if page.status_code != 200:
        raise RuntimeError(f"Unable to retrieve data from {page.url}!")
    soup = BeautifulSoup(page.content, "html.parser")
    # Header where one finds the title:
    article = soup.find("article", class_="node-mineral")
    # There are 4 `blockthermodynamics`:
    # 0 -> Molar mass and Maier-Kelley
    # 1 -> log10K and its coefficients
    # 2 -> Minitable with 298.15K, 1 bar data
    # 3 -> same as coefficients of [1]
    thermo = soup.find_all("div", class_="blockthermodynamics")
    # In thermo[0] there are 2 `table` elements:
    # 0 -> Molar mass, etc
    # 1 -> Maier-Kelley
    (mm, mk) = thermo[0].find_all("table")
    payload = {
        "path": species_path,
        "title": article.find("h2").text,
        "individual_props": get_thermo_row(mm),
        "maierkelley_props": get_thermo_row(mk)
    }
    return payload


def create_raw_database(dbname: str) -> Table:
    """ Manage all required requests to construct a species database.

    Returns the "species" table of the database, scraping the site first
    unless `dbname` already exists (and `FORCE` is False).
    """
    if Path(dbname).exists() and not FORCE:
        # BUG FIX: the original returned the bare `TinyDB` instance here,
        # while the fresh-scrape path returns the "species" table; callers
        # iterating `raw.all()` would then read the (empty) default table
        # on every re-run instead of the scraped species.
        return TinyDB(dbname).table("species")
    t0 = perf_counter()
    paths = retrieve_all_paths()
    db = TinyDB(dbname)
    species = db.table("species", cache_size=len(paths))
    for i, species_path in enumerate(paths):
        try:
            data = get_thermo_species(species_path)
            species.insert(data)
        except Exception as err:
            # Best-effort: report and continue with remaining species.
            print(f"While parsing row {i}: {err}")
    print(f"Extraction took {perf_counter()-t0:.0f} s")
    return db.table("species")


# 10.5 Processing
def process_title(title: str) -> str:
    """ Remove page formatting data ("Formula:" prefix) from title. """
    return re.sub(r"(?i)^formula\s*:\s*", "", title)


def process_entry(refs: Table, track: list[str], text: str) -> float | str:
    """ Capture fields and extract numerical data for creating entry.

    Parses cell text of the form ``"<number> (Ref.: <code>) <detail>"``,
    where every part is optional. New reference codes are inserted into
    `refs` and appended to `track` (mutated in place). Returns the parsed
    number, or the string ``"nan"`` when no number is present.
    """
    pattern = r"""
        ^\s*                             # Optional leading whitespace
        (?P<number>[-+]?\d+(?:\.\d+)?)?  # Optional signed number (int or float)
        \s*                              # Optional whitespace
        (?:                              # Non-capturing group for optional reference
            \(Ref\.\:\s*                 # Literal "(Ref.: " with optional space
            (?P<code>[^)]+)              # Reference code (anything until closing parenthesis)
            \)                           # Closing parenthesis
        )?                               # Entire reference group is optional
        \s*                              # Optional whitespace
        (?P<detail>.*)?                  # Remaining bibliographic detail (optional)
        $                                # End of line
    """
    matches = re.match(pattern, text, re.VERBOSE)
    if matches:
        if (code := matches.group("code")) is not None:
            code = code.replace(",", "|")  # WIP
        if code and not refs.contains(Query().code == code):
            refs.insert({"code": code, "detail": matches.group("detail")})
        if code and code not in track:
            track.append(code)
        if (number := matches.group("number")) is None:
            # NOTE(review): a literal "nan" string (not float("nan")) is
            # returned, presumably for CSV export — confirm downstream use.
            return "nan"
        return float(number)
    return "nan"


def process_species(refs: Table, row: dict[str, Any]) -> dict[str, float | str]:
    """ Prepare tabular data for a given species.

    Maps the positional columns of `individual_props` and
    `maierkelley_props` onto named fields; collects the reference codes
    seen along the way into the comma-joined "track" field.
    """
    entry = {}
    track = []
    entry["path"] = row["path"]
    entry["title"] = process_title(row["title"])
    group = row["individual_props"]
    entry["molar_mass"] = process_entry(refs, track, group[0])
    entry["molar_volume"] = process_entry(refs, track, group[1])
    entry["formation_gibbs"] = process_entry(refs, track, group[2])
    entry["formation_enthalpy"] = process_entry(refs, track, group[3])
    entry["reference_entropy"] = process_entry(refs, track, group[4])
    entry["reference_spec_heat"] = process_entry(refs, track, group[5])
    group = row["maierkelley_props"]
    entry["coef_a"] = process_entry(refs, track, group[0])
    entry["coef_b"] = process_entry(refs, track, group[1])
    entry["coef_c"] = process_entry(refs, track, group[2])
    # Fourth column is parsed only so its reference gets registered.
    _ = process_entry(refs, track, group[3])
    entry["track"] = ",".join(track)
    return entry


# 10.6 Application
def main():
    """ Main application workflow.

    Scrapes (or reloads) the raw species database, then processes each
    species into the "data" table, accumulating bibliography entries in
    the "references" table.
    """
    raw = create_raw_database(db_recovered)
    db = TinyDB(db_processed)
    data = db.table("data")
    refs = db.table("references")
    for i, row in enumerate(raw.all()):
        try:
            entry = process_species(refs, row)
            data.insert(entry)
        except Exception as err:
            # Best-effort: report and continue with remaining rows.
            print(f"While parsing row {i}: {err}")


# Guarded so importing this module does not trigger the scrape
# (original notebook called main() unconditionally).
if __name__ == "__main__":
    main()
# Notebook cell output: "Extraction took 363 s"
# 10.7 Manual
db = TinyDB(db_processed)
data = db.table("data")
refs = db.table("references")
# Notebook cell: `data` displayed
# <Table name='data', total=737, storage=<tinydb.storages.JSONStorage ...>>

# Export processed mineral data, sorted by title:
df = pd.DataFrame(data.all())
df.sort_values("title", inplace=True)
df.to_csv(media / "sandbox-minerals.csv", index=False)

# Export bibliography references, sorted by detail:
df = pd.DataFrame(refs.all())
df.sort_values("detail", inplace=True)
df.to_csv(media / "sandbox-references.csv", index=False)