[deliverable/binutils-gdb.git] / gdb / contrib / words.sh

#!/bin/sh

# Copyright (C) 2019 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# This script intends to facilitate spell checking of source/doc files.
# It:
# - transforms the files into a list of lowercase words
# - prefixes each word with the frequency
# - keeps only the words whose frequency lies within a requested range
# - sorts the words, longest first
#
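# If '-c' is passed as option, it operates on the C comments only, rather than
# on the entire file.
#
# The frequency range is selected with '--min N' and/or '--max N', or with
# '-f N' / '--freq N' to request exactly N.  A bound of 0, which is the
# default, leaves that side of the range unbounded.
#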
# For:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c $files
# ...
# it generates a list of ~15000 words prefixed with frequency.
#
# This could be used to generate a dictionary that is kept as part of the
# sources, against which new code can be checked, generating a warning or
# error.  The hope is that misspellings would trigger this frequently, and rare
# words rarely, otherwise the burden of updating the dictionary would be too
# much.
#
# And for:
# ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
# $ ./gdb/contrib/words.sh -c -f 1 $files
# ...
# it generates a list of ~5000 words with frequency 1.
#
# This can be used to scan for misspellings manually.
#

# Command-line parsing.
#
# Results:
#   c        'true' when -c was given (only C comments are scanned),
#            'false' otherwise.
#   minfreq  lowest frequency to report; 0 means no lower bound.
#   maxfreq  highest frequency to report; 0 means no upper bound.
# The recognized options are shifted away, so that "$@" holds the remaining
# arguments (the files to scan) afterwards.

# Print MESSAGE ($1) on stderr, prefixed with the script name, and exit with
# status 2.
die ()
{
    printf '%s: %s\n' "${0##*/}" "$1" >&2
    exit 2
}

# require_count OPTION NARGS VALUE
#
# Abort unless OPTION is followed by a usable frequency.  NARGS is the number
# of arguments left, counting OPTION itself; VALUE is the argument following
# OPTION (empty if there is none).  Without this check a trailing '-f' makes
# 'shift 2' fail, which loops forever in shells that merely return an error
# from shift.  VALUE is restricted to digits because it ends up in an awk
# comparison.
require_count ()
{
    if [ "$2" -lt 2 ]; then
	die "option '$1' requires an argument"
    fi
    case "$3" in
	'' | *[!0-9]*)
	    die "option '$1' expects a non-negative integer, got '$3'"
	    ;;
    esac
}

# Both bounds start out as 0 (unbounded), so an option that sets only one of
# them leaves the other one open.
minfreq=0
maxfreq=0
c=false
while [ $# -gt 0 ]; do
    case "$1" in
	-c)
	    c=true
	    shift
	    ;;
	--freq|-f)
	    require_count "$1" "$#" "${2-}"
	    minfreq=$2
	    maxfreq=$2
	    shift 2
	    ;;
	--min)
	    require_count "$1" "$#" "${2-}"
	    minfreq=$2
	    shift 2
	    ;;
	--max)
	    require_count "$1" "$#" "${2-}"
	    maxfreq=$2
	    shift 2
	    ;;
	*)
	    # First argument that is not one of our options: everything from
	    # here on is passed through as input files.
	    break
	    ;;
    esac
done

# Write the awk program used by '-c' to a temporary file that is removed
# again when the script exits.  The program prints the text found inside
# C comments (/* ... */) and drops everything else.
awkfile=$(mktemp) || exit 1
trap 'rm -f "$awkfile"' EXIT

# The here-document delimiter is quoted, so the awk text below is taken
# literally and needs no shell escaping.
cat > "$awkfile" <<'EOF'
# in_comment is non-zero while the scan position is inside a comment.  It
# carries over from one line to the next, but not from one file to the next,
# so an unterminated comment cannot swallow the following file.
BEGIN {
    in_comment = 0
}

FNR == 1 {
    in_comment = 0
}

{
    rest = $0
    text = ""
    # A line is printed (possibly empty) whenever any part of it lies
    # inside a comment.
    touched = in_comment

    # Scan left to right, so that every comment on the line contributes,
    # and so that a line that closes one comment and opens another one
    # leaves in_comment set.
    while (rest != "") {
	if (in_comment) {
	    pos = index(rest, "*/")
	    if (pos == 0) {
		text = text rest
		break
	    }
	    # The extra blank keeps the words of adjacent comments apart.
	    text = text substr(rest, 1, pos - 1) " "
	    rest = substr(rest, pos + 2)
	    in_comment = 0
	} else {
	    pos = index(rest, "/*")
	    if (pos == 0) {
		break
	    }
	    rest = substr(rest, pos + 2)
	    in_comment = 1
	    touched = 1
	}
    }

    if (touched) {
	print text
    }
}
EOF

# Stabilize sort.
export LC_ALL=C

# emit_text FILE...
#
# Print the text to be analyzed: only the C comments of the FILEs when $c is
# 'true' (using the awk program in $awkfile), the complete FILEs otherwise.
emit_text ()
{
    if $c; then
	awk \
	    -f "$awkfile" \
	    -- "$@"
    else
	cat -- "$@"
    fi
}

# rank_words
#
# Read text on stdin and print one line per distinct word, each prefixed
# with its frequency as formatted by 'uniq -c', longest line first.
# Steps:
# - sed turns punctuation, brackets, quotes and digit runs into newlines,
#   leaving one candidate word per line (the '\n' and '\t' escapes rely
#   on GNU sed)
# - tr lowercases, sort | uniq -c counts
# - awk keeps the words whose count lies within [$minfreq, $maxfreq],
#   where 0 means no bound, and prefixes each line with its length.  The
#   bounds are passed with -v rather than pasted into the program text;
#   'NF > 1' drops the count of the empty lines produced by the splitting,
#   which is not a word.
# - sort orders by that length, cut removes it again
rank_words ()
{
    sed \
	-e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	-e 's/\[/\n/g' \
	-e 's/\]/\n/g' \
	-e "s/'/\n/g" \
	-e 's/[0-9][0-9]*/\n/g' \
	-e 's/[ \t]*//g' \
	| tr '[:upper:]' '[:lower:]' \
	| sort \
	| uniq -c \
	| awk -v min="$minfreq" -v max="$maxfreq" \
	      'BEGIN { min += 0; max += 0 }
	       NF > 1 && (min == 0 || min <= $1) && (max == 0 || $1 <= max) {
		   print length($0) " " $0
	       }' \
	| sort -n -r \
	| cut -d ' ' -f 2-
}

emit_text "$@" \
    | rank_words
Commit	Line	Data
	1	#!/bin/sh
	2
	3	# Copyright (C) 2019 Free Software Foundation, Inc.
	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	# This script intends to facilitate spell checking of source/doc files.
	18	# It:
	19	# - transforms the files into a list of lowercase words
	20	# - prefixes each word with the frequency
	21	# - filters out words within a frequency range
	22	# - sorts the words, longest first
	23	#
	24	# If '-c' is passed as option, it operates on the C comments only, rather than
	25	# on the entire file.
	26	#
	27	# For:
	28	# ...
	29	# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
	30	# $ ./gdb/contrib/words.sh -c $files
	31	# ...
	32	# it generates a list of ~15000 words prefixed with frequency.
	33	#
	34	# This could be used to generate a dictionary that is kept as part of the
	35	# sources, against which new code can be checked, generating a warning or
	36	# error. The hope is that misspellings would trigger this frequently, and rare
	37	# words rarely, otherwise the burden of updating the dictionary would be too
	38	# much.
	39	#
	40	# And for:
	41	# ...
	42	# $ files=$(find gdb -type f -name "*.c" -o -name "*.h")
	43	# $ ./gdb/contrib/words.sh -c -f 1 $files
	44	# ...
	45	# it generates a list of ~5000 words with frequency 1.
	46	#
	47	# This can be used to scan for misspellings manually.
	48	#
	49
	50	minfreq=
	51	maxfreq=
	52	c=false
	53	while [ $# -gt 0 ]; do
	54	case "$1" in
	55	-c)
	56	c=true
	57	shift
	58	;;
	59	--freq|-f)
	60	minfreq=$2
	61	maxfreq=$2
	62	shift 2
	63	;;
	64	--min)
	65	minfreq=$2
	66	if [ "$maxfreq" = "" ]; then
	67	maxfreq=0
	68	fi
	69	shift 2
	70	;;
	71	--max)
	72	maxfreq=$2
	73	if [ "$minfreq" = "" ]; then
	74	minfreq=0
	75	fi
	76	shift 2
	77	;;
	78	*)
	79	break;
	80	;;
	81	esac
	82	done
	83
	84	if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
	85	minfreq=0
	86	maxfreq=0
	87	fi
	88
	89	awkfile=$(mktemp)
	90	trap 'rm -f "$awkfile"' EXIT
	91
	92	cat > "$awkfile" <<EOF
	93	BEGIN {
	94	in_comment=0
	95	}
	96
	97	// {
	98	line=\$0
	99	}
	100
	101	/\/\*/ {
	102	in_comment=1
	103	sub(/.*\/\*/, "", line)
	104	}
	105
	106	/\*\// {
	107	sub(/\*\/.*/, "", line)
	108	in_comment=0
	109	print line
	110	next
	111	}
	112
	113	// {
	114	if (in_comment) {
	115	print line
	116	}
	117	}
	118	EOF
	119
	120	# Stabilize sort.
	121	export LC_ALL=C
	122
	123	if $c; then
	124	awk \
	125	-f "$awkfile" \
	126	-- "$@"
	127	else
	128	cat "$@"
	129	fi \
	130	| sed \
	131	-e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
	132	-e 's/\[/\n/g' \
	133	-e 's/\]/\n/g' \
	134	-e "s/'/\n/g" \
	135	-e 's/[0-9][0-9]*/\n/g' \
	136	-e 's/[ \t]*//g' \
	137	| tr '[:upper:]' '[:lower:]' \
	138	| sort \
	139	| uniq -c \
	140	| awk "{ if (($minfreq == 0 || $minfreq <= \$1) \
	141	&& ($maxfreq == 0 || \$1 <= $maxfreq)) { print \$0; } }" \
	142	| awk '{ print length($0) " " $0; }' \
	143	| sort -n -r \
	144	| cut -d ' ' -f 2-
	144	\| cut -d ' ' -f 2-