home *** CD-ROM | disk | FTP | other *** search
- # uri.tcl --
- #
- # URI parsing and fetch
- #
- # Copyright (c) 2000 Zveno Pty Ltd
- # Steve Ball, http://www.zveno.com/
- # Derived from urls.tcl by Andreas Kupries
- #
- # TODO:
- # Handle www-url-encoding details
- #
- # CVS: $Id: uri.tcl,v 1.7 2001/08/02 16:38:07 andreas_kupries Exp $
-
- package require Tcl 8.2
- package provide uri 1.0
-
- namespace eval uri {
-
- namespace export split join
- namespace export resolve isrelative
- namespace export geturl
- namespace export canonicalize
-
- variable file:counter 0
-
- # Currently known URL schemes:
- #
- # (RFC 1738)
- # ------------------------------------------------
- # scheme basic syntax of scheme specific part
- # ------------------------------------------------
- # ftp //<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
- #
- # http //<host>:<port>/<path>?<searchpart>
- #
- # gopher //<host>:<port>/<gophertype><selector>
- # <gophertype><selector>%09<search>
- # <gophertype><selector>%09<search>%09<gopher+_string>
- #
- # mailto <rfc822-addr-spec>
- # news <newsgroup-name>
- # <message-id>
- # nntp //<host>:<port>/<newsgroup-name>/<article-number>
- # telnet //<user>:<password>@<host>:<port>/
- # wais //<host>:<port>/<database>
- # //<host>:<port>/<database>?<search>
- # //<host>:<port>/<database>/<wtype>/<wpath>
- # file //<host>/<path>
- # prospero //<host>:<port>/<hsoname>;<field>=<value>
- # ------------------------------------------------
- #
- # (RFC 2111)
- # ------------------------------------------------
- # scheme basic syntax of scheme specific part
- # ------------------------------------------------
- # mid message-id
- # message-id/content-id
- # cid content-id
- # ------------------------------------------------
-
- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # basic regular expressions used in URL syntax.
-
- namespace eval basic {
- variable loAlpha {[a-z]}
- variable hiAlpha {[A-Z]}
- variable digit {[0-9]}
- variable alpha {[a-zA-Z]}
- variable safe {[$_.+-]}
- variable extra {[!*'(,)]}
- # danger in next pattern, order important for []
- variable national {[][|\}\{\^~`]}
- variable punctuation {[<>#%"]} ;#" fake emacs hilit
- variable reserved {[;/?:@&=]}
- variable hex {[0-9A-Fa-f]}
- variable alphaDigit {[A-Za-z0-9]}
- variable alphaDigitMinus {[A-Za-z0-9-]}
-
- # next is <national | punctuation>
- variable unsafe {[][<>"#%\{\}|\\^~`]} ;#" emacs hilit
- variable escape "%${hex}${hex}"
-
- # unreserved = alpha | digit | safe | extra
- # xchar = unreserved | reserved | escape
-
- variable unreserved {[a-zA-Z0-9$_.+!*'(,)-]}
- variable uChar "(${unreserved}|${escape})"
- variable xCharN {[a-zA-Z0-9$_.+!*'(,);/?:@&=-]}
- variable xChar "(${xCharN}|${escape})"
- variable digits "${digit}+"
-
- variable toplabel \
- "(${alpha}${alphaDigitMinus}*${alphaDigit}|${alpha})"
- variable domainlabel \
- "(${alphaDigit}${alphaDigitMinus}*${alphaDigit}|${alphaDigit})"
-
- variable hostname \
- "((${domainlabel}\\.)*${toplabel})"
- variable hostnumber \
- "(${digits}\\.${digits}\\.${digits}\\.${digits})"
-
- variable host "(${hostname}|${hostnumber})"
-
- variable port $digits
- variable hostOrPort "${host}(:${port})?"
-
- variable usrCharN {[a-zA-Z0-9$_.+!*'(,);?&=-]}
- variable usrChar "(${usrCharN}|${escape})"
- variable user "${usrChar}*"
- variable password $user
- variable login "(${user}(:${password})?@)?${hostOrPort}"
- } ;# basic {}
-
- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- # regular expressions covering the various url schemes
-
- # extend this variable in the coming namespaces
- variable schemes {}
-
- # FTP
- namespace eval ftp {
- set escape $[namespace current]::basic::escape
- set login $[namespace current]::basic::login
-
- variable charN {[a-zA-Z0-9$_.+!*'(,)?:@&=-]}
- variable char "(${charN}|${escape})"
- variable segment "${char}*"
- variable path "${segment}(/${segment})*"
-
- variable type {[AaDdIi]}
- variable typepart ";type=${type}"
- variable schemepart \
- "//${login}(/${path}(${typepart})?)?"
-
- variable url "ftp:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes ftp
- }
-
- # FILE
- namespace eval file {
- set host $[namespace current]::basic::host
- set path $[namespace current]::ftp::path
-
- variable schemepart "//(${host}|localhost)?/${path}"
- variable url "file:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes file
- }
-
- # HTTP
- namespace eval http {
- set escape $[namespace current]::basic::escape
- set hostOrPort $[namespace current]basic::hostOrPort
-
- variable charN {[a-zA-Z0-9$_.+!*'(,);:@&=-]}
- variable char "($charN|${escape})"
- variable segment "${char}*"
-
- variable path "${segment}(/${segment})*"
- variable search $segment
- variable schemepart \
- "//${hostOrPort}(/${path}(\\?${search})?)?"
-
- variable url "http:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes http
- }
-
- # GOPHER
- namespace eval gopher {
- set xChar $[namespace current]::basic::xChar
- set hostOrPort $[namespace current]::basic::hostOrPort
- set search $[namespace current]::http::search
-
- variable type $xChar
- variable selector "$xChar*"
- variable string $selector
- variable schemepart \
- "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
- variable url "gopher:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes gopher
- }
-
- # MAILTO
- namespace eval mailto {
- set xChar $[namespace current]::basic::xChar
- set host $[namespace current]::basic::host
-
- variable schemepart "$xChar+(@${host})?"
- variable url "mailto:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes mailto
- }
-
- # NEWS
- namespace eval news {
- set escape $[namespace current]::basic::escape
- set alpha $[namespace current]::basic::alpha
- set host $[namespace current]::basic::host
-
- variable aCharN {[a-zA-Z0-9$_.+!*'(,);/?:&=-]}
- variable aChar "($aCharN|${escape})"
- variable gChar {[a-zA-Z0-9$_.+-]}
- variable group "${alpha}${gChar}*"
- variable article "${aChar}+@${host}"
- variable schemepart "\\*|${group}|${article}"
- variable url "news:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes news
- }
-
- # WAIS
- namespace eval wais {
-
- set uChar $[namespace current]::basic::xChar
- set hostOrPort $[namespace current]::basic::hostOrPort
- set search $[namespace current]::http::search
-
- variable db "${uChar}*"
- variable type "${uChar}*"
- variable path "${uChar}*"
-
- variable database "//${hostOrPort}/${db}"
- variable index "//${hostOrPort}/${db}\\?${search}"
- variable doc "//${hostOrPort}/${db}/${type}/${path}"
-
-
- #variable schemepart "${doc}|${index}|${database}"
-
- variable schemepart \
- "//${hostOrPort}/${db}((\\?${search})|(/${type}/${path}))?"
-
- variable url "wais:${schemepart}"
- # FRINK: nocheck
- lappend [namespace current]::schemes wais
- }
-
- # PROSPERO
- namespace eval prospero {
- set escape $[namespace current]::basic::escape
- set hostOrPort $[namespace current]::basic::hostOrPort
- set path $[namespace current]::ftp::path
-
- variable charN {[a-zA-Z0-9$_.+!*'(,)?:@&-]}
- variable char "(${charN}|$escape)"
-
- variable fieldname "${char}*"
- variable fieldvalue "${char}*"
- variable fieldspec ";${fieldname}=${fieldvalue}"
-
- variable schemepart "//${hostOrPort}/${path}(${fieldspec})*"
- variable url "prospero:$schemepart"
- # FRINK: nocheck
- lappend [namespace current]::schemes prospero
- }
-
- # ------------------------------------------------------
- # combine all together, self adjusting code, no need for maintenance
- # after adding more schemes
-
- variable schemePattern "([::join $schemes |]):"
- variable url ""
- variable url2part
-
- foreach scheme $schemes {
- # FRINK: nocheck
- append url "(${scheme}:[set ${scheme}::schemepart])|"
- # FRINK: nocheck
- set url2part($scheme) "${scheme}:[set ${scheme}::schemepart]"
- }
- set url [string trimright $url |]
- catch {unset scheme}
-
- # ------------------------------------------------------
-
- }
-
- # uri::split --
- #
- # Splits the given <a url> into its constituents.
- #
- # Arguments:
- # url the URL to split
- #
- # Results:
- # Tcl list containing constituents, suitable for 'array set'.
-
- proc uri::split {url} {
-
- set url [string trim $url]
- set scheme {}
-
- # RFC 1738: scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
- regexp -- {^([a-z0-9+-.][a-z0-9+-.]*):} $url dummy scheme
-
- if {$scheme == {}} {
- set scheme http
- }
-
- # ease maintenance: dynamic dispatch, able to handle all schemes
- # added in future!
-
- if {[::info procs Split[string totitle $scheme]] == {}} {
- error "unknown scheme '$scheme' in '$url'"
- }
-
- regsub -- "^${scheme}:" $url {} url
-
- set parts(scheme) $scheme
- array set parts [Split[string totitle $scheme] $url]
-
- # should decode all encoded characters!
-
- return [array get parts]
- }
-
- proc uri::SplitFtp {url} {
- # @c Splits the given ftp-<a url> into its constituents.
- # @a url: The url to split, without! scheme specification.
- # @r List containing the constituents, suitable for 'array set'.
-
- # general syntax:
- # //<user>:<password>@<host>:<port>/<cwd1>/.../<cwdN>/<name>;type=<typecode>
- #
- # additional rules:
- #
- # <user>:<password> are optional, detectable by presence of @.
- # <password> is optional too.
- #
- # "//" [ <user> [":" <password> ] "@"] <host> [":" <port>] "/"
- # <cwd1> "/" ..."/" <cwdN> "/" <name> [";type=" <typecode>]
-
- upvar #0 [namespace current]::ftp::typepart ftptype
-
- array set parts {user {} pwd {} host {} port {} path {} type {}}
-
- # slash off possible type specification
-
- if {[regexp -indices -- "${ftptype}$" $url dummy ftype]} {
- set from [lindex $ftype 0]
- set to [lindex $ftype 1]
-
- set parts(type) [string range $url $from $to]
-
- set from [lindex $dummy 0]
- set url [string replace $url $from end]
- }
-
- # Handle user, password, host and port
-
- if {[string match "//*" $url]} {
- set url [string range $url 2 end]
-
- array set parts [GetUPHP url]
- }
-
- set parts(path) [string trimleft $url /]
-
- return [array get parts]
- }
-
- proc uri::JoinFtp args {
- array set components {
- user {} password {} host {} port {}
- path {} type {}
- }
- array set components $args
-
- set userPwd {}
- if {[string length $components(user)] || [string length $components(password)]} {
- set userPwd $components(user)[expr {[string length $components(password)] ? ":$components(password)" : {}}]@
- }
-
- set port {}
- if {[string length $components(port)]} {
- set port :$components(port)
- }
-
- set type {}
- if {[string length $components(type)]} {
- set type \;$components(type)
- }
-
- return ftp://${userPwd}$components(host)${port}$components(path)$type
- }
-
- proc uri::SplitHttps {url} {
- uri::SplitHttp $url
- }
-
- proc uri::SplitHttp {url} {
- # @c Splits the given http-<a url> into its constituents.
- # @a url: The url to split, without! scheme specification.
- # @r List containing the constituents, suitable for 'array set'.
-
- # general syntax:
- # //<host>:<port>/<path>?<searchpart>
- #
- # where <host> and <port> are as described in Section 3.1. If :<port>
- # is omitted, the port defaults to 80. No user name or password is
- # allowed. <path> is an HTTP selector, and <searchpart> is a query
- # string. The <path> is optional, as is the <searchpart> and its
- # preceding "?". If neither <path> nor <searchpart> is present, the "/"
- # may also be omitted.
- #
- # Within the <path> and <searchpart> components, "/", ";", "?" are
- # reserved. The "/" character may be used within HTTP to designate a
- # hierarchical structure.
- #
- # path == <cwd1> "/" ..."/" <cwdN> "/" <name> ["#" <fragment>]
-
- upvar #0 [namespace current]::http::search search
- upvar #0 [namespace current]::http::segment segment
-
- array set parts {host {} port {} path {} query {}}
-
- set searchPattern "\\?(${search})\$"
- set fragmentPattern "#(${segment})\$"
-
- # slash off possible query
-
- if {[regexp -indices -- $searchPattern $url match query]} {
- set from [lindex $query 0]
- set to [lindex $query 1]
-
- set parts(query) [string range $url $from $to]
-
- set url [string replace $url [lindex $match 0] end]
- }
-
- # slash off possible fragment
-
- if {[regexp -indices -- $fragmentPattern $url match fragment]} {
- set from [lindex $fragment 0]
- set to [lindex $fragment 1]
-
- set parts(fragment) [string range $url $from $to]
-
- set url [string replace $url [lindex $match 0] end]
- }
-
- if {[string match "//*" $url]} {
- set url [string range $url 2 end]
-
- array set parts [GetHostPort url]
- }
-
- set parts(path) [string trimleft $url /]
-
- return [array get parts]
- }
-
- proc uri::JoinHttp {args} {
- eval uri::JoinHttpInner http 80 $args
- }
-
- proc uri::JoinHttps {args} {
- eval uri::JoinHttpInner https 443 $args
- }
-
- proc uri::JoinHttpInner {scheme defport args} {
- array set components [list \
- host {} port $defport path {} query {} \
- ]
- array set components $args
-
- set port {}
- if {[string length $components(port)] && $components(port) != $defport} {
- set port :$components(port)
- }
-
- set query {}
- if {[string length $components(query)]} {
- set query ?$components(query)
- }
-
- regsub -- {^/} $components(path) {} components(path)
-
- return $scheme://$components(host)$port/$components(path)$query
- }
-
- proc uri::SplitFile {url} {
- # @c Splits the given file-<a url> into its constituents.
- # @a url: The url to split, without! scheme specification.
- # @r List containing the constituents, suitable for 'array set'.
-
- upvar #0 [namespace current]::basic::hostname hostname
- upvar #0 [namespace current]::basic::hostnumber hostnumber
-
- if {[string match "//*" $url]} {
- set url [string range $url 2 end]
-
- set hostPattern "^($hostname|$hostnumber)"
- switch -exact -- $::tcl_platform(platform) {
- windows {
- # Catch drive letter
- append hostPattern :?
- }
- default {
- # Proceed as usual
- }
- }
-
- if {[regexp -indices -- $hostPattern $url match host]} {
- set fh [lindex $host 0]
- set th [lindex $host 1]
-
- set parts(host) [string range $url $fh $th]
-
- set matchEnd [lindex $match 1]
- incr matchEnd
-
- set url [string range $url $matchEnd end]
- }
- }
-
- set parts(path) $url
-
- return [array get parts]
- }
-
- proc uri::JoinFile args {
- array set components {
- host {} port {} path {}
- }
- array set components $args
-
- switch -exact -- $::tcl_platform(platform) {
- windows {
- if {[string length $components(host)]} {
- return file://$components(host):$components(path)
- } else {
- return file://$components(path)
- }
- }
- default {
- return file://$components(host)$components(path)
- }
- }
- }
-
- proc uri::SplitMailto {url} {
- # @c Splits the given mailto-<a url> into its constituents.
- # @a url: The url to split, without! scheme specification.
- # @r List containing the constituents, suitable for 'array set'.
-
- if {[regexp -- @ $url]} {
- set url [::split $url @]
- return [list user [lindex $url 0] host [lindex $url 1]]
- } else {
- return [list user $url]
- }
- }
-
- proc uri::JoinMailto args {
- array set components {
- user {} host {}
- }
- array set components $args
-
- return mailto:$components(user)@$components(host)
- }
-
- proc uri::GetUPHP {urlvar} {
- # @c Parse user, password host and port out of the url stored in
- # @c variable <a urlvar>.
- # @d Side effect: The extracted information is removed from the given url.
- # @r List containing the extracted information in a format suitable for
- # @r 'array set'.
- # @a urlvar: Name of the variable containing the url to parse.
-
- upvar #0 [namespace current]::basic::user user
- upvar #0 [namespace current]::basic::password password
- upvar #0 [namespace current]::basic::hostname hostname
- upvar #0 [namespace current]::basic::hostnumber hostnumber
- upvar #0 [namespace current]::basic::port port
-
- upvar $urlvar url
-
- array set parts {user {} pwd {} host {} port {}}
-
- # syntax
- # "//" [ <user> [":" <password> ] "@"] <host> [":" <port>] "/"
- # "//" already cut off by caller
-
- set upPattern "^(${user})(:(${password}))?@"
-
- if {[regexp -indices -- $upPattern $url dummy theUser c d thePassword]} {
- set fu [lindex $theUser 0]
- set tu [lindex $theUser 1]
-
- set fp [lindex $thePassword 0]
- set tp [lindex $thePassword 1]
-
- set parts(user) [string range $url $fu $tu]
- set parts(pwd) [string range $url $fp $tp]
-
- set matchEnd [lindex $user 1]
- incr matchEnd
-
- set url [string range $url $matchEnd end]
- }
-
- set hpPattern "^($hostname|$hostnumber)(:($port))?"
-
- if {[regexp -indices -- $hpPattern $url match theHost c d e f g h thePort]} {
- set fh [lindex $theHost 0]
- set th [lindex $theHost 1]
-
- set fp [lindex $thePort 0]
- set tp [lindex $thePort 1]
-
- set parts(host) [string range $url $fh $th]
- set parts(port) [string range $url $fp $tp]
-
- set matchEnd [lindex $match 1]
- incr matchEnd
-
- set url [string range $url $matchEnd end]
- }
-
- return [array get parts]
- }
-
- proc uri::GetHostPort {urlvar} {
- # @c Parse host and port out of the url stored in variable <a urlvar>.
- # @d Side effect: The extracted information is removed from the given url.
- # @r List containing the extracted information in a format suitable for
- # @r 'array set'.
- # @a urlvar: Name of the variable containing the url to parse.
-
- upvar #0 [namespace current]::basic::hostname hostname
- upvar #0 [namespace current]::basic::hostnumber hostnumber
- upvar #0 [namespace current]::basic::port port
-
- upvar $urlvar url
-
- set pattern "^(${hostname}|${hostnumber})(:(${port}))?"
-
- if {[regexp -indices -- $pattern $url match host c d e f g h thePort]} {
- set fromHost [lindex $host 0]
- set toHost [lindex $host 1]
-
- set fromPort [lindex $thePort 0]
- set toPort [lindex $thePort 1]
-
- set parts(host) [string range $url $fromHost $toHost]
- set parts(port) [string range $url $fromPort $toPort]
-
- set matchEnd [lindex $match 1]
- incr matchEnd
-
- set url [string range $url $matchEnd end]
- }
-
- return [array get parts]
- }
-
- # uri::resolve --
- #
- # Resolve an arbitrary URL, given a base URL
- #
- # Arguments:
- # base base URL (absolute)
- # url arbitrary URL
- #
- # Results:
- # Returns a URL
-
- proc uri::resolve {base url} {
- if {[string length $url]} {
- if {[isrelative $url]} {
-
- array set baseparts [split $base]
-
- switch -- $baseparts(scheme) {
- http -
- https -
- ftp -
- file {
- if {[string match /* $url]} {
- array set relparts [split "$baseparts(scheme)://$url"]
- array set baseparts [list path $relparts(path)]
- catch {array set baseparts [list query $relparts(query)]}
- return [eval join [array get baseparts]]
- } elseif {[string match */ $baseparts(path)]} {
- return ${base}$url
- } else {
- set path [lreplace [::split $baseparts(path) /] end end]
- array set baseparts [list path [::join $path /]/$url]
- return [eval join [array get baseparts]]
- }
- }
-
- default {
- return -code error "unable to resolve relative URL \"$url\""
- }
- }
-
- } else {
- return $url
- }
- } else {
- return $base
- }
- }
-
- # isrelative --
- #
- # Determines whether a URL is absolute or relative
- #
- # Arguments:
- # url URL to check
- #
- # Results:
- # Returns 1 if the URL is relative, 0 otherwise
-
- proc uri::isrelative url {
- return [expr {![regexp -- {^[a-z0-9+-.][a-z0-9+-.]*:} $url]}]
- }
-
- # geturl --
- #
- # Fetch the data from an arbitrary URL.
- #
- # This package provides a handler for the file:
- # scheme, since this conflicts with the file command.
- #
- # Arguments:
- # url address of data resource
- # args configuration options
- #
- # Results:
- # Depends on scheme
-
- proc uri::geturl {url args} {
- array set urlparts [split $url]
-
- switch -- $urlparts(scheme) {
- file {
- return [eval file_geturl [list $url] $args]
- }
-
- default {
- package require $urlparts(scheme)
-
- return [eval [list $urlparts(scheme)::geturl $url] $args]
- }
- }
- }
-
- # uri::file_geturl --
- #
- # geturl implementation for file: scheme
- #
- # TODO:
- # This is an initial, basic implementation.
- # Eventually want to support all options for geturl.
- #
- # Arguments:
- # url URL to fetch
- # args configuration options
- #
- # Results:
- # Returns data from file
-
- proc uri::file_geturl {url args} {
- variable file:counter
-
- set var [namespace current]::file[incr file:counter]
- upvar #0 $var state
- array set state {data {}}
-
- array set parts [split $url]
-
- set ch [open $parts(path)]
- # Could determine text/binary from file extension,
- # except on Macintosh
- # fconfigure $ch -translation binary
- set state(data) [read $ch]
- close $ch
-
- return $var
- }
-
- # uri::join --
- #
- # Format a URL
- #
- # Arguments:
- # args components, key-value format
- #
- # Results:
- # A URL
-
- proc uri::join args {
- array set components $args
-
- return [eval [list Join[string totitle $components(scheme)]] $args]
- }
-
- # uri::canonicalize --
- #
- # Canonicalize a URL
- #
- # Acknowledgements:
- # Andreas Kupries, a.kupries@westend.com
- #
- # Arguments:
- # uri URI (which contains a path component)
- #
- # Results:
- # The canonical form of the URI
-
- proc uri::canonicalize uri {
-
- # Make uri canonical with respect to dots (path changing commands)
- #
- # Remove single dots (.) => pwd not changing
- # Remove double dots (..) => gobble previous segment of path
-
- while {[regexp -- {/\./} $uri]} {
- regsub -all -- {/\./} $uri {/} uri
- }
- while {[regexp -- {/\.\./} $uri]} {
- if {![regsub -- {/[^./]*/\.\./} $uri {/} uri]} {
- # The regexp found 'foo://bar.com/../baz', but this
- # cannot be handled by the regsub. Simply remove the
- # dots, as is done for the singles to break out of
- # infinity.
- regsub -- {/\.\./} $uri {/} uri
- }
- }
-
- return $uri
- }
-
-