home *** CD-ROM | disk | FTP | other *** search
- #!/usr/local/bin/wermit +
- #
- # THE FIRST LINE SHOULD INDICATE THE ACTUAL PATH OF C-KERMIT 9.0
- # and the script should be given execute permission.
- #
- # Usage: weblog filename
- #
- # Reads TSV-format web log for a bilingual English-Spanish website.
- # Extracts all Google searches.
- # Converts the coded search string into plain text.
- # If charset of search string is declared, converts it to ISO-8859-1.
- # If charset is not declared, it is tested for UTF-8 and converted.
- # Normalized search strings are tabulated using associative arrays.
- #
- # Illustrates:
- # . new \fsplit behavior - CK9.0
- # . new \fsqueeze() function
- # . message and if debug commands - New in CK9.0 Alpha.03
- # . decodehex function - New in CK9.0 Alpha.05
- # . stringtype function - New in CK9.0 Alpha.05
- # . use of MIME charset names - New in CK9.0 Alpha.05
- # . associative arrays (not new but little known)
- #
- # Frank da Cruz, Columbia University, April 2010
-
- # The log file name is the one required argument.
- if not defined \%1 {
-     exit 1 "usage: weblog logfilename"
- }
-
- # Open the log for reading; the channel number is stored in \%i.
- fopen /read \%i \fcontents(\%1)
- if failure {
-     exit 1
- }
-
- # Running totals: records read so far, and Google HTML records seen.
- define lines 0
- define google 0
-
- # Debugging messages are switched on when \$(DEBUG) has a value.
- if defined \$(DEBUG) {
-     set debug message on
- }
-
- # Main loop: read the log one record at a time from channel \%i, keep only
- # accesses to .html pages whose field 8 contains ".google.", pull the
- # search string (q=) and optional charset (ie=) out of that field,
- # normalize the string, and count it in the associative arrays
- # search<> (by search string) and cset<> (by charset label).
- while true {
-     fread /line \%i line                   # Read a record
-     if fail break                          # End of file
-     incr lines                             # Have record - count it
-     void \fsplit(\m(line),&a,\9,ALL,,1)    # Split it into fields at tabs
-     if not \findex(.html,\&a[5]) continue  # Reject all non-HTML accesses
-     .isgoogle := \findex(.google.,\&a[8])  # Reject all non-Google accesses
-     if not isgoogle continue
-
-     increment google                       # Have a Google HTML record
-     void \fsplit(\&a[8],&b,&?,ALL,,1)      # Split it into 'clauses' at & and ?
-     if debug show array b
-     undef charset string                   # Clear result variables
-     for i 1 \fdim(&b) 1 {                  # Loop through clauses
-         void \fsplit(\&b[i],&c,=,ALL)      # Split clause into ID and value
-         if equ "\&c[1]" "q" .string := \&c[2]        # Query string
-         else if equ "\&c[1]" "ie" .charset := \&c[2] # Character set
-     }
-     if not def string continue             # No string - skip this record
-     if debug show mac charset string
-
-     # Normalize the string: plus signs become spaces, hex escapes are
-     # decoded, the result is lowercased and squeezed.
-
-     .string := \fsqueeze(\flower(\fdecodehex(\freplace(\m(string),+,\32))))
-     if debug show mac string
-     if def charset {
-         # Charset was declared in the ie= clause: count it and convert.
-         _increment cset<\fupper(\m(charset))>
-         if debug echo "CONVERTING [\m(string)] \m(charset)->ISO-8859-1"
-         .string := \fcvtcset(\m(string),\m(charset),iso-8859-1)
-         if debug show mac string
-     } else if equal "\fstringtype(\m(string))" "UTF8" {
-         # No declaration, but the string tests as UTF-8: convert it.
-         .string := \fcvtcset(\m(string),UTF-8,iso-8859-1)
-         if debug show mac string
-         _increment cset<UNDECLARED_UTF-8_DETECTED>
-     } else {
-         _increment cset<UNDECLARED>
-     }
-
-     # The string is now ISO-8859-1.  The 8-bit characters below are given
-     # as decimal backslash codes rather than literal bytes, so the script
-     # gives the same result whatever encoding the script file is stored in.
-     # (The literal bytes had been damaged by a code page 437 rendering.)
-     #
-     # Fold accented capitals to their small forms:
-     #   193 201 205 211 218 220 209  (capital A E I O U acute, U diaeresis,
-     #   N tilde)  become  225 233 237 243 250 252 241  (the small letters).
-     .string := \fsubstitute(\m(string),\193\201\205\211\218\220\209,\225\233\237\243\250\252\241)
-     # Restore the accents in two common unaccented spellings:
-     # 241 is small n with tilde, 233 is small e with acute.
-     .string := \freplace(\m(string),espanol,espa\241ol)
-     .string := \freplace(\m(string),ingles,ingl\233s)
-     _increment search<\m(string)>
- }
- # End of input reached: print the totals and the two frequency tables.
-
- echo
- echo "Records: \flpad(\m(lines),8)"
- echo "Google: \flpad(\m(google),8)"
- # Flatten search<> into parallel arrays: &a gets the strings, &b the counts.
- assign n \faaconvert(search,&a,&b)
- echo "Unique: \flpad(\m(n),8)"
- # Same for cset<>: &c gets the charset labels, &d the counts.
- assign m \faaconvert(cset,&c,&d)
- echo "Charsets: \flpad(\m(m),8)"
- echo
- echo Charsets by frequency...
- # Order by count, highest first, keeping the labels in step.
- array sort /reverse /numeric &d &c
- for k 1 m 1 {
-     echo \flpad(\m(k),3). \flpad([\&d[k]],8) \&c[k]
- }
- # List no more than the 20 most frequent searches.
- if > n 20 {
-     .n = 20
- }
- echo
- echo Top \m(n) searches by frequency...
- array sort /reverse /numeric &b &a
- for k 1 n 1 {
-     echo \flpad(\m(k),3). \flpad([\&b[k]],8) \&a[k]
- }
- exit 0
-
- ; Local Variables:
- ; comment-column:40
- ; comment-start:"# "
- ; End:
-