from pathlib import Path
from time import perf_counter
from typing import Any
from bs4 import BeautifulSoup
from bs4.element import Tag, ResultSet
from requests.models import Response
from tinydb.database import TinyDB
from tinydb.queries import Query
from tinydb.table import Table
import re
import pandas as pd
import requests

# 10 Minerals web scrapping
#
# References:
#
# 10.1 Toolbox
#
# The following provides a workaround for warnings and certificate
# verification (the target site does not present a valid certificate):
from urllib3.exceptions import InsecureRequestWarning
import urllib3

urllib3.disable_warnings(InsecureRequestWarning)


def get(url: str, **kw) -> Response:
    """ Wrap requests.get without SSL certificate verification.

    NOTE(review): ``verify=False`` is deliberate here (broken upstream
    certificate); do not reuse this helper for hosts where verification
    matters.
    """
    return requests.get(url, verify=False, **kw)


# When True, re-scrape even if a local database file already exists.
FORCE = False

# 10.2 Data paths
# Root directory holding the scraped Thermoddem data files.
media = Path(".").resolve().parent / "data" / "thermoddem"
# Sanity check (notebook cell output was True):
media.exists()

# Raw scraped species payloads:
db_recovered = media / "sandbox-recovered.json"
# Post-processed tabular entries and references:
db_processed = media / "sandbox-processed.json"

# 10.3 Path finding
URL = "https://thermoddem.brgm.fr"
QUERY = "/data/minerals?page={n}"


def get_number_of_pages() -> int:
    """ Find number of pages in database.

    Reads the pager of the first listing page and extracts the page index
    from the "last page" link.

    Raises
    ------
    RuntimeError
        If no ``?page=N`` fragment can be found in the last-page href.
    """
    page = get(f"{URL}{QUERY.format(n=0)}")
    soup = BeautifulSoup(page.content, "html.parser")
    pager = soup.find("ul", class_="pager")
    last = pager.find("li", class_="pager-last")
    ref = last.find("a").attrs["href"]
    if not (match := re.search(r"\?page=(\d+)", ref)):
        # BUG FIX: original message lacked the f-prefix and reported the
        # literal text "{ref}" instead of the offending href.
        raise RuntimeError(f"Cannot extract page number from {ref}")
    return int(match.group(1))


def retrieve_navigation_table(page: Response) -> ResultSet:
    """ Retrieve rows of `<table class="views-table cols-4">` from page.

    BUG FIX: the original annotation said ``Tag`` but the function returns
    the ``find_all("tr")`` result set of the first table body.

    Raises
    ------
    RuntimeError
        If the HTTP status is not 200.
    """
    if page.status_code != 200:
        raise RuntimeError(f"Unable to retrieve data from {page.url}!")
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table", class_="views-table")
    if (n_tables := len(tables)) > 1:
        print(f"Unexpected, more than 1 ({n_tables}) table!")
    tbody = tables[0].find_all("tbody")
    if (n_bodies := len(tbody)) > 1:
        print(f"Unexpected, more than 1 ({n_bodies}) bodies!")
    return tbody[0].find_all("tr")


def extract_species_paths(rows: ResultSet) -> list[str]:
    """ Find link to all species in row set.

    The species link is taken from the last anchor of the first column of
    each row; rows failing to parse are reported and skipped (best-effort).
    """
    paths = []
    for i, row in enumerate(rows):
        try:
            col = row.find_all("td")[0]
            ref = col.find_all("a")[-1].attrs["href"]
            paths.append(ref)
        except Exception as err:
            print(f"While parsing row {i}: {err}")
    return paths


def retrieve_all_paths(verbose: bool = False) -> list[str]:
    """ Walk every listing page and collect all species link paths. """
    n_pages = get_number_of_pages()
    paths = []
    # Page indices are inclusive of the last page, hence n_pages + 1.
    for n in range(n_pages + 1):
        if verbose and not n % 2:
            print(f"Working on page {n:02}/{n_pages}")
        page = get(f"{URL}{QUERY.format(n=n)}")
        rows = retrieve_navigation_table(page)
        paths.extend(extract_species_paths(rows))
    return paths


# 10.4 Data recovery
def get_thermo_row(tag: Tag) -> list[str]:
    """ Extract text data from first row of table. """
    rows = tag.find("tbody").find_all("tr")
    if (n_rows := len(rows)) > 1:
        print(f"Unexpected, more than 1 ({n_rows}) rows!")
    cols = rows[0].find_all("td")
    data = [c.text.strip() for c in cols]
    return data


def get_thermo_species(species_path: str) -> dict[str, Any]:
    """ Retrieve data from a single species URL.

    Raises
    ------
    RuntimeError
        If the HTTP status is not 200.
    """
    page = get(f"{URL}{species_path}")
    if page.status_code != 200:
        raise RuntimeError(f"Unable to retrieve data from {page.url}!")
    soup = BeautifulSoup(page.content, "html.parser")
    # Header where one finds the title:
    article = soup.find("article", class_="node-mineral")
    # There are 4 `blockthermodynamics`:
    # 0 -> Molar mass and Maier-Kelley
    # 1 -> log10K and its coefficients
    # 2 -> Minitable with 298.15K, 1 bar data
    # 3 -> same as coefficients of [1]
    thermo = soup.find_all("div", class_="blockthermodynamics")
    # In thermo[0] there are 2 `table` elements:
    # 0 -> Molar mass, etc
    # 1 -> Maier-Kelley
    (mm, mk) = thermo[0].find_all("table")
    payload = {
        "path": species_path,
        "title": article.find("h2").text,
        "individual_props": get_thermo_row(mm),
        "maierkelley_props": get_thermo_row(mk)
    }
    return payload


def create_raw_database(dbname: str) -> Table:
    """ Manage all required requests to construct a species database.

    Returns the "species" table of the database, scraping the site first
    unless `dbname` already exists (and `FORCE` is False).
    """
    if Path(dbname).exists() and not FORCE:
        # BUG FIX: the original returned the bare `TinyDB` instance here,
        # while the fresh-scrape path returns the "species" table; callers
        # iterating `raw.all()` would then read the (empty) default table
        # on every re-run instead of the scraped species.
        return TinyDB(dbname).table("species")
    t0 = perf_counter()
    paths = retrieve_all_paths()
    db = TinyDB(dbname)
    species = db.table("species", cache_size=len(paths))
    for i, species_path in enumerate(paths):
        try:
            data = get_thermo_species(species_path)
            species.insert(data)
        except Exception as err:
            # Best-effort: report and continue with remaining species.
            print(f"While parsing row {i}: {err}")
    print(f"Extraction took {perf_counter()-t0:.0f} s")
    return db.table("species")


# 10.5 Processing
def process_title(title: str) -> str:
    """ Remove page formatting data ("Formula:" prefix) from title. """
    return re.sub(r"(?i)^formula\s*:\s*", "", title)


def process_entry(refs: Table, track: list[str], text: str) -> float | str:
    """ Capture fields and extract numerical data for creating entry.

    Parses cell text of the form ``"<number> (Ref.: <code>) <detail>"``,
    where every part is optional. New reference codes are inserted into
    `refs` and appended to `track` (mutated in place). Returns the parsed
    number, or the string ``"nan"`` when no number is present.
    """
    pattern = r"""
        ^\s*                             # Optional leading whitespace
        (?P<number>[-+]?\d+(?:\.\d+)?)?  # Optional signed number (int or float)
        \s*                              # Optional whitespace
        (?:                              # Non-capturing group for optional reference
            \(Ref\.\:\s*                 # Literal "(Ref.: " with optional space
            (?P<code>[^)]+)              # Reference code (anything until closing parenthesis)
            \)                           # Closing parenthesis
        )?                               # Entire reference group is optional
        \s*                              # Optional whitespace
        (?P<detail>.*)?                  # Remaining bibliographic detail (optional)
        $                                # End of line
    """
    matches = re.match(pattern, text, re.VERBOSE)
    if matches:
        if (code := matches.group("code")) is not None:
            code = code.replace(",", "|")  # WIP
        if code and not refs.contains(Query().code == code):
            refs.insert({"code": code, "detail": matches.group("detail")})
        if code and code not in track:
            track.append(code)
        if (number := matches.group("number")) is None:
            # NOTE(review): a literal "nan" string (not float("nan")) is
            # returned, presumably for CSV export — confirm downstream use.
            return "nan"
        return float(number)
    return "nan"


def process_species(refs: Table, row: dict[str, Any]) -> dict[str, float | str]:
    """ Prepare tabular data for a given species.

    Maps the positional columns of `individual_props` and
    `maierkelley_props` onto named fields; collects the reference codes
    seen along the way into the comma-joined "track" field.
    """
    entry = {}
    track = []
    entry["path"] = row["path"]
    entry["title"] = process_title(row["title"])
    group = row["individual_props"]
    entry["molar_mass"] = process_entry(refs, track, group[0])
    entry["molar_volume"] = process_entry(refs, track, group[1])
    entry["formation_gibbs"] = process_entry(refs, track, group[2])
    entry["formation_enthalpy"] = process_entry(refs, track, group[3])
    entry["reference_entropy"] = process_entry(refs, track, group[4])
    entry["reference_spec_heat"] = process_entry(refs, track, group[5])
    group = row["maierkelley_props"]
    entry["coef_a"] = process_entry(refs, track, group[0])
    entry["coef_b"] = process_entry(refs, track, group[1])
    entry["coef_c"] = process_entry(refs, track, group[2])
    # Fourth column is parsed only so its reference gets registered.
    _ = process_entry(refs, track, group[3])
    entry["track"] = ",".join(track)
    return entry


# 10.6 Application
def main():
    """ Main application workflow.

    Scrapes (or reloads) the raw species database, then processes each
    species into the "data" table, accumulating bibliography entries in
    the "references" table.
    """
    raw = create_raw_database(db_recovered)
    db = TinyDB(db_processed)
    data = db.table("data")
    refs = db.table("references")
    for i, row in enumerate(raw.all()):
        try:
            entry = process_species(refs, row)
            data.insert(entry)
        except Exception as err:
            # Best-effort: report and continue with remaining rows.
            print(f"While parsing row {i}: {err}")


# Guarded so importing this module does not trigger the scrape
# (original notebook called main() unconditionally).
if __name__ == "__main__":
    main()
# Notebook cell output: "Extraction took 363 s"
# 10.7 Manual
db = TinyDB(db_processed)
data = db.table("data")
refs = db.table("references")
# Notebook cell: `data` displayed
# <Table name='data', total=737, storage=<tinydb.storages.JSONStorage ...>>

# Export processed mineral data, sorted by title:
df = pd.DataFrame(data.all())
df.sort_values("title", inplace=True)
df.to_csv(media / "sandbox-minerals.csv", index=False)

# Export bibliography references, sorted by detail:
df = pd.DataFrame(refs.all())
df.sort_values("detail", inplace=True)
df.to_csv(media / "sandbox-references.csv", index=False)