From: Josh Soref Date: Sun, 6 Jan 2019 03:05:49 +0000 (-0500) Subject: initial spelling checker implementation X-Git-Tag: rec-4.2.0-alpha1~36^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fd416518db058685b7ba55cec40eff5f93907ff0;p=pdns initial spelling checker implementation --- diff --git a/build-scripts/jsoref-spellchecker/exclude.pl b/build-scripts/jsoref-spellchecker/exclude.pl new file mode 100755 index 000000000..afb4900e6 --- /dev/null +++ b/build-scripts/jsoref-spellchecker/exclude.pl @@ -0,0 +1,14 @@ +#!/usr/bin/perl +# This script takes null delimited files as input +# it drops paths that match the listed exclusions +# output is null delimited to match input +$/="\0"; +my @excludes=qw( + spellchecker +); +my $exclude = join "|", @excludes; +while (<>) { + chomp; + next if m{$exclude}; + print "$_$/"; +} diff --git a/build-scripts/jsoref-spellchecker/rpm2cpio.sh b/build-scripts/jsoref-spellchecker/rpm2cpio.sh new file mode 100755 index 000000000..57d291d2a --- /dev/null +++ b/build-scripts/jsoref-spellchecker/rpm2cpio.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +pkg=$1 +if [ "$pkg" = "" -o ! 
-e "$pkg" ]; then + echo "no package supplied" 1>&2 + exit 1 +fi + +leadsize=96 +o=`expr $leadsize + 8` +set `od -j $o -N 8 -t u1 $pkg` +il=`expr 256 \* \( 256 \* \( 256 \* $2 + $3 \) + $4 \) + $5` +dl=`expr 256 \* \( 256 \* \( 256 \* $6 + $7 \) + $8 \) + $9` +# echo "sig il: $il dl: $dl" + +sigsize=`expr 8 + 16 \* $il + $dl` +o=`expr $o + $sigsize + \( 8 - \( $sigsize \% 8 \) \) \% 8 + 8` +set `od -j $o -N 8 -t u1 $pkg` +il=`expr 256 \* \( 256 \* \( 256 \* $2 + $3 \) + $4 \) + $5` +dl=`expr 256 \* \( 256 \* \( 256 \* $6 + $7 \) + $8 \) + $9` +# echo "hdr il: $il dl: $dl" + +hdrsize=`expr 8 + 16 \* $il + $dl` +o=`expr $o + $hdrsize` +EXTRACTOR="dd if=$pkg ibs=$o skip=1" + +COMPRESSION=`($EXTRACTOR |file -) 2>/dev/null` +if echo $COMPRESSION |grep -q gzip; then + DECOMPRESSOR=gunzip +elif echo $COMPRESSION |grep -q bzip2; then + DECOMPRESSOR=bunzip2 +elif echo $COMPRESSION |grep -iq xz; then # xz and XZ safe + DECOMPRESSOR=unxz +elif echo $COMPRESSION |grep -q cpio; then + DECOMPRESSOR=cat +else + # Most versions of file don't support LZMA, therefore we assume + # anything not detected is LZMA + DECOMPRESSOR=`which unlzma 2>/dev/null` + case "$DECOMPRESSOR" in + /* ) ;; + * ) DECOMPRESSOR=`which lzmash 2>/dev/null` + case "$DECOMPRESSOR" in + /* ) DECOMPRESSOR="lzmash -d -c" ;; + * ) DECOMPRESSOR=cat ;; + esac + ;; + esac +fi + +$EXTRACTOR 2>/dev/null | $DECOMPRESSOR diff --git a/build-scripts/test-spelling-unknown-words b/build-scripts/test-spelling-unknown-words new file mode 100755 index 000000000..21ee82294 --- /dev/null +++ b/build-scripts/test-spelling-unknown-words @@ -0,0 +1,118 @@ +#!/bin/bash +# This CI acceptance test is based on: +# https://github.com/jsoref/spelling/tree/04648bdc63723e5cdf5cbeaff2225a462807abc8 +# It is conceptually `f` which runs `w` (spelling-unknown-word-splitter) +# plus `fchurn` which uses `dn` mostly rolled together. 
+set -e + +spellchecker='build-scripts/jsoref-spellchecker' +temp='../.ci-temp' +whitelist_path="$spellchecker/whitelist.words" +dict="$temp/english.words" +word_splitter="$spellchecker/spelling-unknown-word-splitter.pl" +run_output="$spellchecker/unknown.words" +if [ ! -e "$dict" ]; then + mkdir -p "$temp" + echo "Retrieve ./usr/share/dict/linux.words" + words_rpm="$temp/words.rpm" + mirror="https://rpmfind.net" + file_path="/linux/fedora/linux/development/rawhide/Everything/aarch64/os/Packages/w/" + location="${mirror}${file_path}" + file_name="$(curl -s "$location" | grep -o "words-.*.noarch.rpm" || echo "")" + if [ -z "$file_name" ]; then + echo "$0 failed to retrieve url for words package from $location" + exit 3 + fi + location="${mirror}${file_path}${file_name}" + curl "$location" -o "$words_rpm" + if ! "$spellchecker/rpm2cpio.sh" "$words_rpm" |\ + perl -e '$/="\0"; while (<>) {if (/^0707/) { $state = (m!\./usr/share/dict/linux.words!) } + elsif ($state == 1) { print }} '\ + > "$dict"; then + rpm_extract_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}" + rm -f "$words_rpm" "$dict" + echo "$0 failed to extract words ($location as $words_rpm) ($rpm_extract_status)" + exit 4 + fi + rpm_extract_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}" + if [ "$rpm_extract_status" != '0 0' ]; then + echo "$0 failed to extract words ($location as $words_rpm) ($rpm_extract_status)" + rm -f "$words_rpm" "$dict" + exit 5 + fi + rm "$words_rpm" +fi + +if [ ! -e "$word_splitter" ]; then + echo "Retrieve w" + w_location='https://raw.githubusercontent.com/jsoref/spelling/master/w' + curl -s "$w_location" |\ + perl -p -n -e "s</usr/share/dict/words><$dict>" > "$word_splitter" + get_word_splitter_status="${PIPESTATUS[0]} ${PIPESTATUS[1]}" + if [ "$get_word_splitter_status" != '0 0' ]; then + echo "$0 failed to retrieve/adapt word splitter ($w_location) ($get_word_splitter_status)" + rm -f "$word_splitter" + exit 6 + fi + chmod u+x "$word_splitter" + echo "Retrieved." 
+ ls -la "$word_splitter" +fi + +echo "Clean up from previous run" +rm -f "$run_output" + +echo "Run w" +(git 'ls-files' -z 2> /dev/null || hg locate -0) |\ + "$spellchecker/exclude.pl" |\ + xargs -0 "$word_splitter" |\ + "$word_splitter" |\ + perl -p -n -e 's/ \(.*//' > "$run_output" + word_splitter_status="${PIPESTATUS[2]} ${PIPESTATUS[3]}" + if [ "$word_splitter_status" != '0 0' ]; then + echo "$word_splitter failed ($word_splitter_status)" + exit 2 + fi + +printDetails() { + echo '' + echo 'If you are ok with the output of this run, you will need to' +} + +echo "Review results" +if [ ! -e "$whitelist_path" ]; then + echo "No preexisting $whitelist_path file." + printDetails + echo 'cat > '"$whitelist_path"' <