Commit | Line | Data |
---|---|---|
496af5c8 TV |
1 | #!/bin/sh |
2 | ||
3 | # Copyright (C) 2019 Free Software Foundation, Inc. | |
4 | # This program is free software; you can redistribute it and/or modify | |
5 | # it under the terms of the GNU General Public License as published by | |
6 | # the Free Software Foundation; either version 3 of the License, or | |
7 | # (at your option) any later version. | |
8 | # | |
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU General Public License for more details. | |
13 | # | |
14 | # You should have received a copy of the GNU General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
16 | ||
17 | # This script intends to facilitate spell checking of comments in C sources. | |
18 | # It: | |
19 | # - extracts comments from C files | |
20 | # - transforms the comments into a list of lowercase words | |
21 | # - prefixes each word with the frequency | |
22 | # - filters out words within a frequency range | |
23 | # - sorts the words, longest first | |
24 | # | |
25 | # For: | |
26 | # ... | |
27 | # $ ./gdb/contrib/words.sh $(find gdb -type f -name "*.c" -o -name "*.h") | |
28 | # ... | |
29 | # it generates a list of ~15000 words prefixed with frequency. | |
30 | # | |
31 | # This could be used to generate a dictionary that is kept as part of the | |
32 | # sources, against which new code can be checked, generating a warning or | |
33 | # error. The hope is that misspellings would trigger this frequently, and rare | |
34 | # words rarely, otherwise the burden of updating the dictionary would be too | |
35 | # much. | |
36 | # | |
37 | # And for: | |
38 | # ... | |
39 | # $ ./gdb/contrib/words.sh -f 1 $(find gdb -type f -name "*.c" -o -name "*.h") | |
40 | # ... | |
41 | # it generates a list of ~5000 words with frequency 1. | |
42 | # | |
43 | # This can be used to scan for misspellings manually. | |
44 | # | |
45 | ||
# Parse the leading frequency options.
#
# Usage: parse_options [--freq|-f N] [--min N] [--max N] [FILE...]
#
# Sets:
#   minfreq   lowest frequency to keep (0 means no lower bound)
#   maxfreq   highest frequency to keep (0 means no upper bound)
#   optcount  number of leading arguments that were consumed as options
#
# Parsing stops at the first argument that is not one of the options above;
# that argument and everything after it are left for the caller.
#
# Returns 2, after printing a diagnostic on stderr, when an option has no
# value or the value is not a non-negative integer.
parse_options ()
{
    minfreq=
    maxfreq=
    optcount=0

    while [ $# -gt 0 ]; do
        case "$1" in
            --freq|-f|--min|--max)
                ;;
            *)
                break
                ;;
        esac

        # Check before "shift 2": with a single remaining argument some
        # shells fail the shift without shifting, which would make this
        # loop spin forever.
        if [ $# -lt 2 ]; then
            printf '%s: option %s requires an argument\n' "${0##*/}" "$1" >&2
            return 2
        fi

        # The value ends up in a numeric comparison, so insist on digits.
        case "$2" in
            ''|*[!0-9]*)
                printf '%s: option %s: invalid frequency: %s\n' \
                    "${0##*/}" "$1" "$2" >&2
                return 2
                ;;
        esac

        case "$1" in
            --freq|-f)
                minfreq=$2
                maxfreq=$2
                ;;
            --min)
                minfreq=$2
                # Leave an earlier --max/--freq value alone.
                if [ "$maxfreq" = "" ]; then
                    maxfreq=0
                fi
                ;;
            --max)
                maxfreq=$2
                # Leave an earlier --min/--freq value alone.
                if [ "$minfreq" = "" ]; then
                    minfreq=0
                fi
                ;;
        esac

        shift 2
        optcount=$((optcount + 2))
    done

    # No frequency option given: do not filter at all.
    if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
        minfreq=0
        maxfreq=0
    fi
}

parse_options "$@" || exit 2
# Drop the options so that "$@" holds only the file names.
shift "$optcount"
79 | ||
# Write the comment-extraction awk program to a temporary file that is
# removed again when the script exits.
awkfile=$(mktemp) || exit 1
trap 'rm -f "$awkfile"' EXIT

# The heredoc delimiter is quoted so the shell copies the program verbatim
# and awk's $0 needs no escaping.
cat > "$awkfile" <<'EOF'
# Print the text found inside C block comments, one output line for each
# input line that contains comment text.  The in_comment flag carries the
# state across lines so that multi-line comments are followed.
BEGIN {
    in_comment = 0
}

{
    rest = $0
    text = ""
    # A line that starts inside a comment is always printed, even if empty.
    found = in_comment

    # Scan left to right so that several comments on one line, and a
    # comment that is closed and then reopened on the same line, are all
    # handled.
    while (1) {
	if (in_comment) {
	    pos = index(rest, "*/")
	    if (pos == 0) {
		# The comment continues on the next line.
		text = text rest
		break
	    }
	    # Keep the text before the terminator; the blank keeps words of
	    # adjacent comments apart.
	    text = text substr(rest, 1, pos - 1) " "
	    rest = substr(rest, pos + 2)
	    in_comment = 0
	} else {
	    pos = index(rest, "/*")
	    if (pos == 0)
		break
	    rest = substr(rest, pos + 2)
	    in_comment = 1
	    found = 1
	}
    }

    if (found)
	print text
}
EOF
110 | ||
# Stabilize sort.
export LC_ALL=C

# Turn comment text read from stdin into a list of lowercase words, each
# prefixed with its frequency, longest first, written to stdout.
#
# Usage: count_words MINFREQ MAXFREQ
#   MINFREQ  drop words seen fewer than MINFREQ times (0 or empty: no limit)
#   MAXFREQ  drop words seen more than MAXFREQ times (0 or empty: no limit)
count_words ()
{
    # 1. Replace punctuation, blanks, brackets and digit runs with newlines
    #    so that every word ends up on a line of its own.
    # 2. Lowercase everything.
    # 3. Delete the empty lines left behind by the splitting; otherwise
    #    uniq would count the empty string as a word.
    # 4. Count the occurrences of each word.
    # 5. Keep the words inside the requested frequency range.  The limits
    #    are handed to awk with -v rather than pasted into the program text.
    # 6. Prefix each line with its length, sort on that (longest first) and
    #    strip the prefix again.
    sed \
        -e 's/[%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
        -e 's/\[/\n/g' \
        -e 's/\]/\n/g' \
        -e 's/[0-9][0-9]*/\n/g' \
        | tr '[:upper:]' '[:lower:]' \
        | sed -e 's/[ \t]*//g' -e '/^$/d' \
        | sort \
        | uniq -c \
        | awk -v min="${1:-0}" -v max="${2:-0}" '
            BEGIN { min += 0; max += 0 }
            (min == 0 || min <= $1) && (max == 0 || $1 <= max)' \
        | awk '{ print length($0) " " $0; }' \
        | sort -n -r \
        | cut -d ' ' -f 2-
}

awk -f "$awkfile" -- "$@" | count_words "$minfreq" "$maxfreq"