[deliverable/binutils-gdb.git] / gdb / contrib / words.sh

#!/bin/sh

# Copyright (C) 2019 Free Software Foundation, Inc.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# This script intends to facilitate spell checking of comments in C sources.
# It:
# - extracts comments from C files
# - transforms the comments into a list of lowercase words
# - prefixes each word with the frequency
# - filters out words within a frequency range
# - sorts the words, longest first
#
# For:
# ...
# $ ./gdb/contrib/words.sh $(find gdb -type f -name "*.c" -o -name "*.h")
# ...
# it generates a list of ~15000 words prefixed with frequency.
#
# This could be used to generate a dictionary that is kept as part of the
# sources, against which new code can be checked, generating a warning or
# error.  The hope is that misspellings would trigger this frequently, and rare
# words rarely, otherwise the burden of updating the dictionary would be too
# much.
#
# And for:
# ...
# $ ./gdb/contrib/words.sh -f 1 $(find gdb -type f -name "*.c" -o -name "*.h")
# ...
# it generates a list of ~5000 words with frequency 1.
#
# This can be used to scan for misspellings manually.
#

minfreq=
maxfreq=
while [ $# -gt 0 ]; do
    case "$1" in
	--freq|-f)
	    minfreq=$2
	    maxfreq=$2
	    shift 2
	    ;;
	--min)
	    minfreq=$2
	    if [ "$maxfreq" = "" ]; then
		maxfreq=0
	    fi
	    shift 2
	    ;;
	--max)
	    maxfreq=$2
	    if [ "$minfreq" = "" ]; then
		minfreq=0
	    fi
	    shift 2
	    ;;
	*)
	    break;
	    ;;
    esac
done

if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
    minfreq=0
    maxfreq=0
fi

awkfile=$(mktemp)
trap 'rm -f "$awkfile"' EXIT

cat > "$awkfile" <<EOF
BEGIN {
    in_comment=0
}

// {
    line=\$0
}

/\/\*/ {
    in_comment=1
    sub(/.*\/\*/, "", line)
}

/\*\// {
    sub(/\*\/.*/, "", line)
    in_comment=0
    print line
    next
}

// {
    if (in_comment) {
	print line
    }
}
EOF

# Stabilize sort.
export LC_ALL=C

awk \
    -f "$awkfile" \
    -- "$@" \
    | sed 's/[%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
    | sed 's/\[/\n/g' \
    | sed 's/\]/\n/g' \
    | sed 's/[0-9][0-9]*/\n/g' \
    | tr '[:upper:]' '[:lower:]' \
    | sed 's/[ \t]*//g' \
    | sort \
    | uniq -c \
    | awk "{ if (($minfreq == 0 || $minfreq <= \$1) \
                 && ($maxfreq == 0 || \$1 <= $maxfreq)) { print \$0; } }" \
    | awk '{ print length($0) " " $0; }' \
    | sort -n -r \
    | cut -d ' ' -f 2-
Commit	Line	Data
496af5c8 TV	1	#!/bin/sh
	2
	3	# Copyright (C) 2019 Free Software Foundation, Inc.
	4	# This program is free software; you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation; either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	# This script intends to facilitate spell checking of comments in C sources.
	18	# It:
	19	# - extracts comments from C files
	20	# - transforms the comments into a list of lowercase words
	21	# - prefixes each word with the frequency
	22	# - filters out words within a frequency range
	23	# - sorts the words, longest first
	24	#
	25	# For:
	26	# ...
	27	# $ ./gdb/contrib/words.sh $(find gdb -type f -name ".c" -o -name ".h")
	28	# ...
	29	# it generates a list of ~15000 words prefixed with frequency.
	30	#
	31	# This could be used to generate a dictionary that is kept as part of the
	32	# sources, against which new code can be checked, generating a warning or
	33	# error. The hope is that misspellings would trigger this frequently, and rare
	34	# words rarely, otherwise the burden of updating the dictionary would be too
	35	# much.
	36	#
	37	# And for:
	38	# ...
	39	# $ ./gdb/contrib/words.sh -f 1 $(find gdb -type f -name ".c" -o -name ".h")
	40	# ...
	41	# it generates a list of ~5000 words with frequency 1.
	42	#
	43	# This can be used to scan for misspellings manually.
	44	#
	45
	46	minfreq=
	47	maxfreq=
	48	while [ $# -gt 0 ]; do
	49	case "$1" in
	50	--freq\|-f)
	51	minfreq=$2
	52	maxfreq=$2
	53	shift 2
	54	;;
	55	--min)
	56	minfreq=$2
	57	if [ "$maxfreq" = "" ]; then
	58	maxfreq=0
	59	fi
	60	shift 2
	61	;;
	62	--max)
	63	maxfreq=$2
	64	if [ "$minfreq" = "" ]; then
65	minfreq=0
66	fi
67	shift 2
68	;;
69	*)
70	break;
71	;;
72	esac
73	done
74
75	if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
76	minfreq=0
77	maxfreq=0
78	fi
79
80	awkfile=$(mktemp)
81	trap 'rm -f "$awkfile"' EXIT
82
83	cat > "$awkfile" <<EOF
84	BEGIN {
85	in_comment=0
86	}
87
88	// {
89	line=\$0
90	}
91
92	/\/\*/ {
93	in_comment=1
94	sub(/.\/\/, "", line)
95	}
96
97	/\*\// {
98	sub(/\\/./, "", line)
99	in_comment=0
100	print line
101	next
102	}
103
104	// {
105	if (in_comment) {
106	print line
107	}
108	}
109	EOF
110
111	# Stabilize sort.
112	export LC_ALL=C
113
114	awk \
115	-f "$awkfile" \
116	-- "$@" \
117	\| sed 's/[%^$~#{}`&=@,. \t\/_()\|<>\+\*-]/\n/g' \
118	\| sed 's/\[/\n/g' \
119	\| sed 's/\]/\n/g' \
120	\| sed 's/[0-9][0-9]*/\n/g' \
121	\| tr '[:upper:]' '[:lower:]' \
122	\| sed 's/[ \t]*//g' \
123	\| sort \
124	\| uniq -c \
125	\| awk "{ if (($minfreq == 0 \|\| $minfreq <= \$1) \
126	&& ($maxfreq == 0 \|\| \$1 <= $maxfreq)) { print \$0; } }" \
127	\| awk '{ print length($0) " " $0; }' \
128	\| sort -n -r \
129	\| cut -d ' ' -f 2-