#!/bin/bash

# Root directory of the FAQ sources; internal links are resolved relative to it.
FAQ_SRC_BASE_DIR=source
# Report of the last `make linkcheck` run, read by apply_linkcheck_log.
LINKCHECK_LOG_TXT=build/linkcheck/output.txt
# Log file listing the pages that received the "obsolete links" admonition.
OBSOLETE_LINKS_LOG=nouveaux_liens_obsoletes
# Log file listing the start pages that received the "missing links" admonition.
MISSING_LINKS_START_PAGES_LOG=nouveaux_liens_manquants
# Prefix of the generated patch file name (a timestamp and `.diff` are appended).
OUTPUT_PREFIX=patch-correction-liens

# Header appended to both log files on every run: today's date (YYYY-MM-DD)
# framed by two rules. The date is evaluated once, when this line is executed.
LOG_HEADER='
============
'"$(date +%F)"'
============
'

# MyST `toctree` block for start pages.
# NOTE(review): not referenced anywhere in the visible part of this script.
CUSTOMIZED_START_PREAMBLE='```{toctree}
:glob: true
:maxdepth: 1
:hidden:

*/start
*
```'

# Markup inserted right after a link to flag it as obsolete.
OBSOLETE_LINK_MARKUP='{sup}`lien obsolète`'

# Admonition appended to a start page before the list of unclassified links.
# Its second line is the text used to detect that it is already present.
ADMONITION_MISSING_LINKS=':::{todo}
Les liens qui suivent ne sont pas classés.
:::'

# Admonition added to a page in which at least one link was marked obsolete.
# Its second line is the text used to detect that it is already present.
ADMONITION_OBSOLETE_LINKS=':::{todo}
Cette page a des liens obsolètes.
:::'

# Explains what the script is about to do, then makes sure the working copy
# is clean enough to run it.
# Outputs: the explanation (and, if needed, the warning) on stdout;
#          a git failure is reported on stderr.
# Exits:   the whole script with status 1 when tracked files have unstaged
#          modifications, or when git cannot be queried at all.
confirmation() {
    local diff_status
    echo '
Ce script vérifie les liens internes et externes dans toute la FAQ.
Il marque les liens brisés comme obsolètes
et corrige automatiquement les liens redirigés.
Pour les liens externes, il s’appuie sur la dernière sortie de linkcheck,
qui s’invoque comme suit : `make linkcheck`.
  NOTE : linkcheck prend beaucoup de temps.
  La sortie sur le terminal peut s’interrompre occasionnellement :
  c’est normal tant que l’interruption ne dépasse pas dix minutes.

À la fin du processus, toutes les modifications
effectuées par le présent script sur les fichiers `.md`
vont être annulées. À la place, vous obtiendrez un patch
que vous pourrez ensuite inspecter et corriger
avant de l’appliquer à votre copie de la FAQ
et d’enregistrer les modifications.
'
    sleep 1
    # `git diff --quiet` exits with 1 when tracked files have unstaged changes.
    # Relying on the exit status (rather than on the human-readable text of
    # `git status`) keeps the check working whatever language git speaks.
    git diff --quiet
    diff_status=$?
    if [[ "$diff_status" -eq 1 ]] ; then
        echo 'ATTENTION : Votre copie de la FAQ contient des modifications
que vous n’avez pas encore enregistrées avec `git commit`.
Veuillez les valider (`git commit`) ou les annuler (`git stash`)
avant de relancer le script. Voici la liste des fichiers concernés :
'
        sleep 1
        git status --untracked-files=no |
            sed '/^no changes added to commit/d' |
            sed 's/^/  /'
        exit 1
    elif [[ "$diff_status" -ne 0 ]] ; then
        # Any other status means git itself failed (e.g. not a repository):
        # stop before any file gets modified without a way to revert it.
        echo 'ERREUR : `git diff` a échoué ; ce script doit être lancé depuis la copie de travail git de la FAQ.' >&2
        exit 1
    fi
}
        

# Announces on stdout that the link verification has started.
# The message is framed by one empty line above and one below.
# Arguments are accepted but ignored.
warning() {
    printf '\n%s\n\n' 'Vérification des liens en cours…'
}


# Lists the link targets a section's start page is expected to contain.
# Globals:   FAQ_SRC_BASE_DIR (read)
# Arguments: $1 - directory of the section
# Outputs:   one target per line on stdout: first the `start.md` of every
#            direct sub-directory, then the `.md` files of the directory
#            itself (files whose name ends in `start.md` excepted).
#            Targets are relative to FAQ_SRC_BASE_DIR and have no `.md`
#            suffix, since the suffix is not always present in link targets.
make_links_list_for_section() {
    local curr_dir="$1"
    local path
    {
        find "$curr_dir" -maxdepth 2 -path "$curr_dir"'/*/start.md'
        find "$curr_dir" -maxdepth 1 -type f -name '*.md' -not -name '*start.md'
    } | while IFS= read -r path ; do
        # Literal prefix removal: safe whatever characters the base
        # directory contains (`/`, `.`, …), unlike a sed regexp built from it.
        path="${path#"$FAQ_SRC_BASE_DIR"}"
        if [[ -n "$path" ]] ; then
            printf '%s\n' "${path%.md}"
        fi
    done
}

# Finds the expected links that a start page does not contain yet.
# Arguments: $1 - path of the start page
# Outputs:   on stdout, one Markdown list item `-   [](<target>)` per target
#            returned by make_links_list_for_section (called on the page's
#            directory) for which the page contains no `(<target>)` string.
get_missing_links() {
    local curr_sphinx_file="$1"
    local missing_links=''
    local link
    while IFS= read -r link ; do
        # An empty list still yields one empty line through the here-string.
        [[ -n "$link" ]] || continue
        # Fixed-string search: a target is a path, not a regular expression
        # (a `.` in it must not match any character).
        if ! grep -q -F -- "($link)" "$curr_sphinx_file" ; then
            missing_links+='-   []('"$link"')'$'\n'
        fi
    # Quoted so that the lines are kept apart by every bash version.
    done <<< "$(make_links_list_for_section "${curr_sphinx_file%/*}")"
    printf '%s\n' "$missing_links"
}

# Tells whether the target of an internal link exists in the sources.
# Globals:   FAQ_SRC_BASE_DIR (read)
# Arguments: $1 - link target, relative to FAQ_SRC_BASE_DIR, possibly
#                 followed by `#anchor` and possibly without `.md` suffix
# Outputs:   the path of the target on stdout if it exists, nothing otherwise
# Returns:   0 if the target exists, 1 otherwise
internal_link_exists() {
    local target="${1%#*}" # Remove references to anchors
    local name="${target##*/}"
    # Only the last path component decides whether an extension is present:
    # a dot in a parent directory must not prevent adding `.md`.
    if [[ "$name" != *.* ]] ; then
        target+=.md
    fi
    # Plain existence test rather than parsing the output of `ls`
    # (which prints nothing for an existing but empty directory).
    if [[ -e "$FAQ_SRC_BASE_DIR$target" ]] ; then
        printf '%s\n' "$FAQ_SRC_BASE_DIR$target"
    else
        return 1
    fi
}

# Escapes a string so that it matches itself literally when used
# inside an extended regular expression (`grep -E`, `sed -E`).
# Arguments: $1 - string to escape
# Outputs:   the escaped string on stdout
# The characters ] [ ? . * + { } ( ) | $ and \ get a backslash prefix.
# NOTE: `^` is left untouched; change_link uses it as the delimiter of its
# sed `s` commands, so links containing `^` are not supported there anyway.
escape_for_regexp() {
    local input="$1"
    local output=''
    local char i
    for (( i = 0; i < ${#input}; i++ )) ; do
        char="${input:i:1}"
        case "$char" in
            ']' | '[' | '?' | '.' | '*' | '+' | '{' | '}' | '(' | ')' | '|' | '$' | '\')
                output+='\' ;;
        esac
        output+="$char"
    done
    printf '%s\n' "$output"
}

# Replaces a link, or appends markup to it, in a file (edited in place),
# unless the link is embedded in another link (e.g. in the Wayback Machine).
# Limitation: operates only on the first occurrence of the link in a line.
# Arguments: $1 - operation: 'replace' or 'mark'
#            $2 - link to look for
#            $3 - new link for 'replace', markup to append for 'mark'
#            $4 - path of the file to edit
#            $5 - optional: line number; without it, every line is processed
# Outputs:   `true` on stdout when the link was found as a whole link,
#            nothing otherwise.
change_link() {
    local operation="$1"
    local link="$2"
    local str="$3"
    local filepath="$4"
    local line="$5"
    local link_regexp markedup_link replacement char i

    link_regexp="$(escape_for_regexp "$link")"
    # The link together with the delimiter characters surrounding it, if any.
    markedup_link="$(sed -n "$line"'p' "$filepath" |
                         grep -o -E '[<(`]?'"$link_regexp"'[>)`]?' |
                         head -n 1)"
    # The line number given by linkcheck may be wrong.
    # In this case, the function is called again on the whole file.
    # If the link fails to be found again, skip.
    # (linkcheck sometimes identifies strings as links erroneously,
    # e.g. *Developpez.net* in source/1_generalites/documentation/sites/forums.md)
    if [[ ! "$markedup_link" && "$line" ]] ; then
        change_link "$operation" "$link" "$str" "$filepath"
    elif [[ "$(is_whole_link "$markedup_link")" ]] ; then
        # Protect the characters that are special in the replacement part of
        # a sed `s` command: `\`, `&` (stands for the matched text; very
        # common in URL query strings) and `^` (the delimiter used below).
        replacement=''
        for (( i = 0; i < ${#str}; i++ )) ; do
            char="${str:i:1}"
            case "$char" in
                '\' | '&' | '^') replacement+='\' ;;
            esac
            replacement+="$char"
        done
        case "$operation" in
            'replace')
                sed -i -E "$line"'s^'"$link_regexp"'^'"$replacement"'^' "$filepath" ;;
            'mark')
                # `&` re-inserts the matched link, whatever characters it contains.
                sed -i -E "$line"'s^'"$(escape_for_regexp "$markedup_link")"'^&'"$replacement"'^' "$filepath" ;;
        esac
        echo true
    fi
}

# Marks a link as obsolete in a file, unless the file already contains
# that link followed by the obsolete-link markup.
# Globals:   OBSOLETE_LINK_MARKUP (read)
# Arguments: $1 - link
#            $2 - path of the file
#            $3 - optional: line number, passed on to change_link
# Outputs:   `true` on stdout if change_link reported that it marked the
#            link, nothing otherwise.
mark_new_obsolete_link() {
    local link="$1"
    local file="$2"
    local line="$3"
    local signaled_link_regexp
    # The link, an optional closing `)` or `>`, then the markup.
    signaled_link_regexp="$(escape_for_regexp "$link")"'([)>])?'"$(escape_for_regexp "$OBSOLETE_LINK_MARKUP")"
    if grep -q -E -- "$signaled_link_regexp" "$file" ; then
        return 0 # already marked: nothing new to report
    fi
    if [[ -n "$(change_link 'mark' "$link" "$OBSOLETE_LINK_MARKUP" "$file" "$line")" ]] ; then
        echo true
    fi
}

# Marks as obsolete the internal links of a page whose target is missing.
# Internal links are the Markdown targets of the form `](/<1-8>_…)`.
# Arguments: $1 - path of the page
# Outputs:   `true` on stdout if at least one link was newly marked,
#            an empty line otherwise.
signal_obsolete_links() {
    local curr_sphinx_file="$1"
    local obsolete_links=''
    local link
    while IFS= read -r link ; do
        if [[ -n "$link" && -z "$(internal_link_exists "$link")" &&
                  -n "$(mark_new_obsolete_link "$link" "$curr_sphinx_file")" ]] ; then
            obsolete_links=true
        fi
    # The list is collected (and quoted, so that lines are kept apart by
    # every bash version) before the page gets modified in the loop.
    done <<< "$(grep -o -E '\]\(/[1-8]_[^[:space:]]+\)' "$curr_sphinx_file" |
                    sort -u |
                    cut -f 2 -d '(' | cut -f 1 -d ')')"
    echo "$obsolete_links"
}

# Adds the "obsolete links" admonition to a page that does not have it yet,
# and records the page in the OBSOLETE_LINKS_LOG file.
# Globals:   ADMONITION_OBSOLETE_LINKS, ADMONITION_MISSING_LINKS,
#            OBSOLETE_LINKS_LOG (read);
#            file, admo_missing_links_ln, n (written: they are not local)
# Arguments: $1 - path of the page (edited in place)
# If the page contains the "missing links" admonition, the new admonition is
# inserted a few lines above it; otherwise it is appended at the end of the page.
add_missing_admonition_obsolete_links(){
    file="$1"
    # The second line of an admonition is its text: its presence in the page
    # is used as the sign that the admonition is already there.
    if [[ ! "$(grep -F "$(sed -n 2p <<< "$ADMONITION_OBSOLETE_LINKS")" "$file")" ]] ; then
        echo "$file" >> "$OBSOLETE_LINKS_LOG"
        # Line number of the first occurrence of the "missing links" text,
        # minus 1. When the text is absent the operand is empty and the
        # result is expected to be -1, which the test below relies on.
        admo_missing_links_ln="$(("$(grep -n -F "$(sed -n 2p <<< "$ADMONITION_MISSING_LINKS")" "$file" | cut -d ':' -f 1 | head -n 1)" - 1))"
        if [[ "$admo_missing_links_ln" -gt -1 ]] ; then
            # Append the three admonition lines after the same line number,
            # last line first, so that they end up in their natural order.
            # (One-line `a text` form: presumably GNU sed is required.)
            for n in $(seq 3 -1 1) ; do
                sed -i "$(($admo_missing_links_ln - 2))"'a '"$(sed -n ${n}p <<< "$ADMONITION_OBSOLETE_LINKS")" "$file"
            done
            # Append an empty line after line (admo_missing_links_ln - 4).
            # NOTE(review): the admonition above was inserted after line
            # (admo_missing_links_ln - 2); this empty line therefore lands two
            # lines higher, not directly before the admonition. Confirm that
            # the `- 4` offset is intended. Also, both offsets assume that the
            # "missing links" admonition is not within the first lines of the page.
            sed -i "$(($admo_missing_links_ln - 4))"'a ''
' "$file"
        else
            # No "missing links" admonition: append at the end of the page,
            # surrounded by empty lines.
            echo '' >> "$file"
            echo "$ADMONITION_OBSOLETE_LINKS" >> "$file"
            echo '' >> "$file"
        fi
    fi
}

# Decides whether a link, given with the delimiter characters found around
# it, is a link of its own rather than a fragment of something else.
# Arguments: $1 - the link, possibly preceded by one of < ( ` and/or
#                 followed by one of > ) `
# Outputs:   `true` on stdout when the link is enclosed in a matching pair
#            (`<…>` or `(…)`) or in no delimiter at all; nothing when it is
#            empty, touches a backtick, or has an unbalanced delimiter.
is_whole_link() {
    case "$1" in
        '' | '`'* | *'`')
            ;; # empty, or (partly) inside inline code
        '<'*'>' | '('*')')
            echo true ;; # properly enclosed
        '<'* | '('* | *'>' | *')')
            ;; # delimiter on one side only, or mismatched pair
        *)
            echo true ;; # bare link
    esac
}

# Appends to every start page with a hidden table of contents the links
# to the pages of its section that it does not contain yet.
# Globals:   FAQ_SRC_BASE_DIR, ADMONITION_MISSING_LINKS,
#            MISSING_LINKS_START_PAGES_LOG (read)
# The first time links are added to a page, the "missing links" admonition
# is appended before them and the page is recorded in the log file.
check_missing_links_in_start_pages() {
    local start_file missing_links admonition_text
    # The text line of the admonition reveals whether a page already has it.
    admonition_text="$(sed -n 2p <<< "$ADMONITION_MISSING_LINKS")"
    while IFS= read -r start_file ; do
        # An empty list still yields one empty line through the here-string.
        [[ -n "$start_file" ]] || continue
        grep -q -F -- ':hidden:' "$start_file" || continue
        missing_links="$(get_missing_links "$start_file")"
        [[ -n "$missing_links" ]] || continue
        if ! grep -q -F -- "$admonition_text" "$start_file" ; then
            echo "$start_file" >> "$MISSING_LINKS_START_PAGES_LOG"
            {
                echo ''
                echo "$ADMONITION_MISSING_LINKS"
                echo ''
            } >> "$start_file"
        fi
        echo "$missing_links" >> "$start_file"
    # Quoted so that the paths are kept apart by every bash version.
    done <<< "$(find "$FAQ_SRC_BASE_DIR" -type f -name 'start.md')"
}

# Checks the internal links of every page of the FAQ, a few pages excepted,
# and adds the "obsolete links" admonition to the pages in which a link
# was newly marked as obsolete.
# Globals:   FAQ_SRC_BASE_DIR (read)
check_obsolete_internal_links() {
    local file
    while IFS= read -r file ; do
        # An empty list still yields one empty line through the here-string.
        [[ -n "$file" ]] || continue
        if [[ -n "$(signal_obsolete_links "$file")" ]] ; then
            add_missing_admonition_obsolete_links "$file"
        fi
    # The list is fully collected (and quoted, so that paths are kept apart
    # by every bash version) before any page gets modified in the loop.
    done <<< "$(find "$FAQ_SRC_BASE_DIR" -type f -name '*.md' \
                     -not -path "${FAQ_SRC_BASE_DIR}/8_contribuer/syntax.md" \
                     -not -path "${FAQ_SRC_BASE_DIR}/8_contribuer/antiseche.md" \
                     -not -path "${FAQ_SRC_BASE_DIR}/8_contribuer/questions_a_reviser.md" \
                     -not -path "${FAQ_SRC_BASE_DIR}/8_contribuer/pages_a_traduire.md" \
                     -not -path "${FAQ_SRC_BASE_DIR}/7_cette_faq/questions_les_plus_frequentes.md")"
}

# Converts a full URL into the abbreviated `role:target` notation
# when the URL starts with one of the known prefixes.
# Arguments: $1 - URL
# Outputs:   the abbreviated link on stdout, or nothing for any other URL
shortcut_faq() {
    local mapping prefix
    # Each entry reads <role>=<URL prefix>.
    for mapping in \
        'ctanpkg=https://ctan.org/pkg/' \
        'texdoc=https://texdoc.net/pkg/' \
        'faquk=https://texfaq.org/' \
        'wpfr=https://fr.wikipedia.org/wiki/' \
        'wp=https://en.wikipedia.org/wiki/' \
        'isbn=https://isbndb.com/book/' \
        'doi=https://doi.org/'
    do
        prefix="${mapping#*=}"
        if [[ "$1" == "$prefix"* ]] ; then
            printf '%s\n' "${mapping%%=*}:${1#"$prefix"}"
            return 0
        fi
    done
    return 0
}

# Marks a broken external link as obsolete in a page, looking for the link
# as a full URL first, then in its abbreviated `role:target` notation
# (see shortcut_faq). If the link was newly marked, makes sure the page
# carries the "obsolete links" admonition.
# Arguments: $1 - URL of the broken link
#            $2 - path of the page
#            $3 - line number reported for the link
mark_broken_external_link() {
    local link="$1"
    local filepath="$2"
    local line="$3"
    local shortcut
    if [[ -z "$(mark_new_obsolete_link "$link" "$filepath" "$line")" ]] ; then
        # The full URL was not marked: try the abbreviated notation, if any.
        shortcut="$(shortcut_faq "$link")"
        if [[ -z "$shortcut" ||
                  -z "$(mark_new_obsolete_link "$shortcut" "$filepath" "$line")" ]] ; then
            return 0
        fi
    fi
    add_missing_admonition_obsolete_links "$filepath"
}

# Filters out the links that must never be reported as broken.
# Arguments: $1 - link
# Outputs:   the link on stdout, unless it contains one of the
#            whitelisted strings
# Returns:   the status of grep: 0 if the link was printed, 1 otherwise
is_not_in_broken_whitelist() {
    local -a whitelist=(
        'texdoc.net'
        'https://latex.org/forum'
        'https://texblog.net'
        'https://texample.net'
    )
    local -a grep_patterns=()
    local entry
    for entry in "${whitelist[@]}" ; do
        grep_patterns+=(-e "$entry")
    done
    grep -v -F "${grep_patterns[@]}" <<< "$1"
}

# Filters out the links whose redirections must be left alone.
# Arguments: $1 - link
# Outputs:   the link on stdout, unless it contains one of the
#            whitelisted strings
# Returns:   the status of grep: 0 if the link was printed, 1 otherwise
is_not_in_redirect_whitelist() {
    local -a whitelist=(
        'mirrors.ctan.org'
        'www.youtube.com'
        'doi.org'
        'stackexchange.com'
        'stackoverflow.com'
        'gitlab.gutenberg-asso.fr'
    )
    local -a grep_patterns=()
    local entry
    for entry in "${whitelist[@]}" ; do
        grep_patterns+=(-e "$entry")
    done
    grep -v -F "${grep_patterns[@]}" <<< "$1"
}

# Applies the report of the last linkcheck run to the sources:
# broken links are marked as obsolete, redirected links are replaced
# by their new location.
# Globals:   LINKCHECK_LOG_TXT, FAQ_SRC_BASE_DIR (read)
# Outputs:   an error message on stderr if the report cannot be read
# Returns:   1 if the report cannot be read
# Only the `[broken]` and `[redirected …]` report lines are considered;
# "403 Forbidden" answers are ignored, as are the whitelisted links.
apply_linkcheck_log() {
    local logline infos filepath line diagn link newlink
    if [[ ! -r "$LINKCHECK_LOG_TXT" ]] ; then
        echo 'ERREUR : le journal de linkcheck ('"$LINKCHECK_LOG_TXT"') est introuvable ; lancez d’abord `make linkcheck`.' >&2
        return 1
    fi
    # No `IFS=` here: surrounding blanks of a report line are dropped on
    # purpose, the target of a redirection being expected at the very end.
    while read -r logline ; do
        [[ -n "$logline" ]] || continue
        # <file>:<line>: [<diagnostic>] <link>[:] <details>
        # is turned into four tab-separated fields.
        infos="$(sed -E 's/^([^:]+):([0-9]+): \[([a-zA-Z ]+)\] ([^ ]+[^: ]):? .*$/\1\t\2\t\3\t\4/' <<< "$logline")"
        IFS=$'\t' read -r filepath line diagn link <<< "$infos"
        filepath="$FAQ_SRC_BASE_DIR"/"$filepath"
        case "$diagn" in
            'broken')
                if [[ "$(is_not_in_broken_whitelist "$link")" ]] ; then
                    mark_broken_external_link "$link" "$filepath" "$line"
                fi;;
            'redirected with Found' | 'redirected permanently')
                if [[ "$(is_not_in_redirect_whitelist "$link")" ]] ; then
                    newlink="$(sed -E 's/^.+ to ([^[:space:]]+)$/\1/' <<< "$logline")"
                    # Sometimes the redirection leads to the home page of the
                    # new site rather than to the new location of the page.
                    # In that case it is better to declare the link broken.
                    # A home page is recognized by counting the slashes of
                    # the URL, final slash excluded.
                    if [[ "$(grep -o '/' <<< "${newlink%/}" | wc -l)" -lt 3 &&
                              "$(grep -o '/' <<< "${link%/}" | wc -l)" -gt 2 ]] ; then
                        mark_broken_external_link "$link" "$filepath" "$line"
                    else
                        change_link 'replace' "$link" "$newlink" "$filepath" "$line" > /dev/null
                    fi
                fi;;
        esac
    # Quoted so that the report lines are kept apart by every bash version.
    done <<< "$(grep -F -e '[broken]' -e '[redirected' "$LINKCHECK_LOG_TXT" |
                    grep -v -F -e '403 Client Error: Forbidden for url')"
}

# Saves the modifications made to the working copy as a patch file, then
# sets them aside with `git stash`, and tells the user how to go on.
# Arguments: $1 - path of the patch file to write
# Outputs:   the instructions on stdout; an error message on stderr
#            if git failed
# Returns:   1 if the patch could not be written or the modifications
#            could not be stashed
make_patch() {
    if ! { git diff --patch --output="$1" && git stash ; } ; then
        # Do not announce a patch that may not exist.
        echo 'ERREUR : la création du patch ('"$1"') ou la mise de côté des modifications (`git stash`) a échoué.' >&2
        return 1
    fi
    echo 'Le patch est dans le fichier '"$1"
    echo 'Après l’avoir vérifié et éventuellement corrigé,'
    echo 'vous pourrez l’appliquer avec la commande'
    echo '`git apply '"$1"'`,'
    echo 'puis enregistrer les modifications avec `git commit`.'
}

######## main ########

# Explain the procedure; the script stops here if the working copy
# has modifications that are not committed yet.
confirmation

# Name of the patch file: prefix, then date and time (yy-mm-dd_HH-MM).
output_file="$OUTPUT_PREFIX"_"$(date '+%y-%m-%d_%H-%M')".diff

# NOTE: warning() does not use the argument it is given.
warning "$output_file"

# Start a new dated section in both log files.
echo "$LOG_HEADER" >> "$OBSOLETE_LINKS_LOG"
echo "$LOG_HEADER" >> "$MISSING_LINKS_START_PAGES_LOG"

# The three passes edit the `.md` files in place, in this order.
check_obsolete_internal_links
check_missing_links_in_start_pages
apply_linkcheck_log

# Turn the modifications into a patch file and stash them.
make_patch "$output_file"
