From: Josh Soref <jsoref@users.noreply.github.com>
Date: Sun, 6 Jan 2019 03:05:49 +0000 (-0500)
Subject: initial spelling checker implementation
X-Git-Tag: rec-4.2.0-alpha1~36^2~1
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fd416518db058685b7ba55cec40eff5f93907ff0;p=pdns

initial spelling checker implementation
---

diff --git a/build-scripts/jsoref-spellchecker/exclude.pl b/build-scripts/jsoref-spellchecker/exclude.pl
new file mode 100755
index 000000000..afb4900e6
--- /dev/null
+++ b/build-scripts/jsoref-spellchecker/exclude.pl
@@ -0,0 +1,14 @@
+#!/usr/bin/perl
+# This script takes a NUL-delimited list of paths as input (STDIN or files
+# named on the command line), drops the paths that match any listed
+# exclusion, and writes the remaining paths NUL-delimited to match the input.
+use strict;
+use warnings;
+$/ = "\0";    # records end in NUL: chomp strips it, print puts it back
+my @excludes = qw( spellchecker );    # regex fragments, matched unanchored
+my $exclude = join '|', @excludes;
+while (my $path = <>) {
+  chomp $path;
+  next if @excludes && $path =~ m{$exclude};    # empty list excludes nothing
+  print "$path$/";
+}
diff --git a/build-scripts/jsoref-spellchecker/rpm2cpio.sh b/build-scripts/jsoref-spellchecker/rpm2cpio.sh
new file mode 100755
index 000000000..57d291d2a
--- /dev/null
+++ b/build-scripts/jsoref-spellchecker/rpm2cpio.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+pkg=$1
+if [ "$pkg" = "" -o ! -e "$pkg" ]; then
+    echo "no package supplied" 1>&2
+    exit 1
+fi
+
+leadsize=96
+o=`expr $leadsize + 8`
+set `od -j $o -N 8 -t u1 $pkg`
+il=`expr 256 \* \( 256 \* \( 256 \* $2 + $3 \) + $4 \) + $5`
+dl=`expr 256 \* \( 256 \* \( 256 \* $6 + $7 \) + $8 \) + $9`
+# echo "sig il: $il dl: $dl"
+
+sigsize=`expr 8 + 16 \* $il + $dl`
+o=`expr $o + $sigsize + \( 8 - \( $sigsize \% 8 \) \) \% 8 + 8`
+set `od -j $o -N 8 -t u1 $pkg`
+il=`expr 256 \* \( 256 \* \( 256 \* $2 + $3 \) + $4 \) + $5`
+dl=`expr 256 \* \( 256 \* \( 256 \* $6 + $7 \) + $8 \) + $9`
+# echo "hdr il: $il dl: $dl"
+
+hdrsize=`expr 8 + 16 \* $il + $dl`
+o=`expr $o + $hdrsize`
+EXTRACTOR="dd if=$pkg ibs=$o skip=1"
+
+COMPRESSION=`($EXTRACTOR |file -) 2>/dev/null`
+if echo $COMPRESSION |grep -q gzip; then
+        DECOMPRESSOR=gunzip
+elif echo $COMPRESSION |grep -q bzip2; then
+        DECOMPRESSOR=bunzip2
+elif echo $COMPRESSION |grep -iq xz; then # xz and XZ safe
+        DECOMPRESSOR=unxz
+elif echo $COMPRESSION |grep -q cpio; then
+        DECOMPRESSOR=cat
+else
+        # Most versions of file don't support LZMA, therefore we assume
+        # anything not detected is LZMA
+        DECOMPRESSOR=`which unlzma 2>/dev/null`
+        case "$DECOMPRESSOR" in
+            /* ) ;;
+            *  ) DECOMPRESSOR=`which lzmash 2>/dev/null`
+             case "$DECOMPRESSOR" in
+                     /* ) DECOMPRESSOR="lzmash -d -c" ;;
+                     *  ) DECOMPRESSOR=cat ;;
+                 esac
+                 ;;
+        esac
+fi
+
+$EXTRACTOR 2>/dev/null | $DECOMPRESSOR
diff --git a/build-scripts/test-spelling-unknown-words b/build-scripts/test-spelling-unknown-words
new file mode 100755
index 000000000..21ee82294
--- /dev/null
+++ b/build-scripts/test-spelling-unknown-words
@@ -0,0 +1,118 @@
+#!/bin/bash
+# This CI acceptance test is based on:
+# https://github.com/jsoref/spelling/tree/04648bdc63723e5cdf5cbeaff2225a462807abc8
+# It is conceptually `f` which runs `w` (spelling-unknown-word-splitter)
+# plus `fchurn` which uses `dn` mostly rolled together.
+set -e  # abort on the first failing command; pipeline members are checked by hand via PIPESTATUS
+# All paths below are relative to the current directory (presumably the repository root -- confirm against the CI config).
+spellchecker='build-scripts/jsoref-spellchecker'  # directory holding the helper scripts and word lists
+temp='../.ci-temp'  # download/cache directory for the dictionary
+whitelist_path="$spellchecker/whitelist.words"  # accepted unknown words; each run is diffed against it
+dict="$temp/english.words"  # dictionary extracted from the downloaded words package
+word_splitter="$spellchecker/spelling-unknown-word-splitter.pl"  # downloaded, adapted copy of `w`
+run_output="$spellchecker/unknown.words"  # unknown words reported by this run
+# Build $dict on first use: download the Fedora "words" RPM and extract linux.words from it.
+if [ ! -e "$dict" ]; then
+  mkdir -p "$temp"
+  echo "Retrieve ./usr/share/dict/linux.words"
+  words_rpm="$temp/words.rpm"
+  mirror="https://rpmfind.net"
+  file_path="/linux/fedora/linux/development/rawhide/Everything/aarch64/os/Packages/w/"
+  location="${mirror}${file_path}"
+  file_name="$(curl -s "$location" | grep -o 'words-[^"<>]*\.noarch\.rpm' | head -n 1)"  # first match in the listing only
+  if [ -z "$file_name" ]; then
+    echo "$0 failed to retrieve url for words package from $location"
+    exit 3
+  fi
+  location="${mirror}${file_path}${file_name}"
+  curl -f "$location" -o "$words_rpm"  # -f: an HTTP error aborts (set -e) instead of saving the error page as the RPM
+  if ! "$spellchecker/rpm2cpio.sh" "$words_rpm" |\
+    perl -e '$/="\0"; while (<>) {if (/^0707/) { $state = (m!\./usr/share/dict/linux.words!) } elsif ($state == 1) { print }} '\
+    > "$dict"; then
+    rpm_extract_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}"  # statuses of rpm2cpio.sh and perl
+    rm -f "$words_rpm" "$dict"
+    echo "$0 failed to extract words ($location as $words_rpm) ($rpm_extract_status)"
+    exit 4
+  fi
+  rpm_extract_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}"  # the if above only saw perl's status; check rpm2cpio.sh too
+  if [ "$rpm_extract_status" != '0 0' ]; then
+    echo "$0 failed to extract words ($location as $words_rpm) ($rpm_extract_status)"
+    rm -f "$words_rpm" "$dict"
+    exit 5
+  fi
+  rm "$words_rpm"
+fi
+
+if [ ! -e "$word_splitter" ]; then
+  echo "Retrieve w"
+  w_location='https://raw.githubusercontent.com/jsoref/spelling/master/w'
+  curl -s "$w_location" |\
+    perl -p -n -e "s</usr/share/dict/words><$dict>" > "$word_splitter"
+  get_word_splitter_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}"
+  if [ "$get_word_splitter_status" != '0 0' ]; then
+    echo "$0 failed to retrieve/adapt word splitter ($w_location) ($get_word_splitter_status)"
+    rm -f "$word_splitter"
+    exit 6
+  fi
+  chmod u+x "$word_splitter"
+  echo "Retrieved."
+  ls -la "$word_splitter"
+fi
+
+echo "Clean up from previous run"
+rm -f "$run_output"
+
+echo "Run w"
+(git 'ls-files' -z 2> /dev/null || hg locate -0) |\
+  "$spellchecker/exclude.pl" |\
+  xargs -0 "$word_splitter" |\
+  "$word_splitter" |\
+  perl -p -n -e 's/ \(.*//' > "$run_output"
+  word_splitter_status="${PIPESTATUS[2]} ${PIPESTATUS[3]}"
+  if [ "$word_splitter_status" != '0 0' ]; then
+    echo "$word_splitter failed ($word_splitter_status)"
+    exit 2
+  fi
+
+# Print a blank line and the lead-in sentence; callers echo the actual update command right after.
+printDetails() {
+  printf '\n%s\n' 'If you are ok with the output of this run, you will need to'
+}
+
+echo "Review results"
+if [ ! -e "$whitelist_path" ]; then  # no whitelist yet: print a command that would create it from this run, exit 2
+  echo "No preexisting $whitelist_path file."
+  printDetails
+  echo 'cat > '"$whitelist_path"' <<EOF=EOF'
+  cat "$run_output"
+  echo EOF=EOF
+  exit 2
+fi
+# Whitelist vs. this run; grep drops the ---/+++ headers (they name paths under $spellchecker), || true tolerates no output.
+diff_output=$(diff -U1 "$whitelist_path" "$run_output" |grep -v "$spellchecker" || true)
+
+if [ -z "$diff_output" ]; then
+  echo "No new words and misspellings found."
+  exit 0
+fi
+# Lines this run adds, ignoring case-only differences (diff -i), with the leading "+" stripped.
+new_output=$(diff -i -U0 "$whitelist_path" "$run_output" |grep -v "$spellchecker" |\
+  perl -n -w -e 'next unless /^\+/; next if /^\+{3} /; s/^.//; print;')
+if [ -z "$new_output" ]; then
+  echo "There are now fewer misspellings than before."
+  echo "$whitelist_path could be updated:"
+  echo ''
+  echo "patch '$whitelist_path' <<EOF"
+  echo "$diff_output"
+  echo "EOF"
+  sleep 5
+  exit 1
+fi
+echo "New misspellings found, please review:"
+echo "$new_output"
+printDetails
+echo "patch '$whitelist_path' <<EOF"  # path quoted, matching the other suggested patch command
+echo "$diff_output"
+echo "EOF"
+sleep 5
+exit 1