home *** CD-ROM | disk | FTP | other *** search
- Newsgroups: comp.lang.perl
- Path: sparky!uunet!caen!sol.ctr.columbia.edu!IDA.ORG!rlg
- From: rlg@IDA.ORG (Randy garrett)
- Subject: Fast String Operations?
- Message-ID: <1992Sep10.220438.21395@IDA.ORG>
- Organization: IDA, Alexandria, VA
- X-Newsreader: Tin 1.1 PL4
- Date: Thu, 10 Sep 92 22:04:38 GMT
- Lines: 404
-
- Here at last is a long delayed summary.
-
- The original question had to do with replacing all occurrences
- of "||" in a string with either a ~ or a -1 depending on
- the value of another array. This is basically a database
- conversion process, where all fields that are of type int ought
- to get a NULL value of -1 and all fields of type char should get
- a ~ . For instance, given a string "abc|123|||def|" and a
- corresponding array (1,2,1,2,2,1) produce the string
- "abc|123|-1|~|def|-1", if we assume an array value of 1 means int
- and an array value of 2 means char.
-
- I got some great suggestions for improving the speed of my code.
- Thanks very much for all the help!
-
- The code I posted was a subset of the actual code, so a few
- of the suggestions were difficult to merge into the real code.
- My explanation of some of the fine points also left something
- to be desired.
-
- Finally, I just ran out of time to do testing on some of the
- submissions. Sorry! If time permits, I will also test those.
- Also, I munged the submitted code slightly to permit benchmarking.
- If I lost the original intent in doing so, please correct me.
-
- The system I tested on was a Sun 4/490 running SunOS 4.1.1 and
- perl-4.019 compiled using the Sun bundled C compiler. A better
- compiler might speed all these results up. I identified the
- submissions by FS0 thru FS4. I did not finish benching
- submissions FS5 thru FS9. The code and its authors are at the
- end of this message. I am extremely sorry to say that I lost
- the names of the authors of FS3 and FS4. Please forgive me, and if
- you read this, please send me a reply.
-
- For a 10,000 character length string, all NULLS: (extreme test case)
- All times are the sum of user and system CPU seconds. Also, it took
- about 1.0 second just to init the string.
-
- FS0: 21
- FS1: 10.6
- FS2: 13.5
- FS3: 4.8
- FS4: 10
-
- As you can see all the submissions were about twice as fast as
- my original, and FS3 was about 4x faster.
-
- For a more realistic test, I ran a loop with 4000 iterations with
- a string of 25 Nulls. Just running a 4000 iteration loop with
- no ops inside took .3 seconds.
-
- FS0: 63
- FS1: 37
- FS2: 39
- FS3: 18
- FS4: 35
-
- Since FS3 looked like the winner, I did more tests on it.
-
- # of loops / # of Nulls Time
-
- 1 / 10000 4.8
- 1 / 5000 2.0
- 1 / 1000 .5
- 1 / 100 .2
-
- 100 / 100 2.2
- 1000 / 100 19.4
- 4000 / 100 76.0
-
- 1000 / 25 4.5
- 4000 / 25 18.0
-
- ######### Actual Code ################
-
- ########### FS0: by me -- Randy Garrett #########
-
- #!/usr/local/bin/perl
- # Walk thru a string; if find "||" insert either a ~ or a -1
- # depending on value of @name
- #
- # Benchmark harness: builds a string of $Num '|' characters and an @name
- # of $Num entries that are all 1, runs the scan 4000 times, and prints the
- # user and system CPU times reported by times() before and after.
- $Num = 25;
- #$string = "a|bcd||g||i||||||";
- $str = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"1"); }
- # print "$string\n";
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- for ($j = 0; $j < 4000; ++$j)
- {
- $count = 0; # index into @name
- $pos = 0; # index into $string
- $string = $str; # fresh copy each pass, because the loop edits $string in place
- while (($pos = index($string,'|',$pos)) >= 0) {
- # print "Found at $pos $count = $name[$count]\n";
- # Only a '|' directly followed by another '|' is treated as a NULL, so
- # an empty field at the very end of the string is left alone.
- # NOTE(review): $count is the number of pipes already passed, so the
- # empty field after the first pipe uses $name[0] -- confirm intended.
- if (substr($string,$pos+1,1) eq "|" ) { # found a NULL
- if ($name[$count] == 1) { # int ?
- substr($string,$pos+1,0) = -1; } # assigning to a zero-length substr inserts
- elsif ($name[$count] == 2 ) { # char?
- substr($string,$pos+1,0) = "~"; }
- }
- $pos++; $count++; # move past this '|'; inserted text holds no '|'
- }
- # print "$string\n";
- }
- print "$string\n";
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- ################### FS1 #########################
- # Hal R. Pomeranz -- pomeranz@nas.nasa.gov
- #!/usr/local/bin/perl
- # Split the string on '|', fill in every false field from @name
- # (1 gives -1, anything else gives ~), and join the fields again.
- # Benchmark harness: 4000 passes over a string of $Num pipes.
-
-
- $Num = 25;
- $string = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"1"); }
- #$string = "a|bcd||g||i|||";
- #@name = (1,2,1,1,2,1,2,1);
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- for ($j = 0; $j < 4000; ++$j)
- {
-
- # Without a limit, split drops trailing empty fields; the loop below
- # still assigns all of $cols[0 .. $#name], which recreates them.
- @cols = split(/\|/, $string);
- for ($i = 0; $i < @name; $i++) {
- # NOTE(review): a truth test, so a field holding "0" is also replaced.
- $cols[$i] = ($name[$i] == 1 ? -1 : "~") unless ($cols[$i]);
- }
- # print join('|', @cols), "|\n";
-
- join('|', @cols); # result is discarded; kept only so the join is timed
- }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- ########## FS 2 ###############
- # Christopher Davis -- ckd@eff.org
- #
-
- #!/usr/local/bin/perl -- # -*-Perl-*-
- # Turns @name into a list of 0/1 flags, then rebuilds the field list one
- # field at a time: a false field becomes ~ when its flag is true and -1
- # otherwise. Benchmark harness: 4000 passes over a string of $Num pipes.
- # $string = "a|bcd||g||i|||";
- # @name = (1,1,2,1,1,2,1,2,1); # yours was off-by-one, I added 1 at start
-
- $Num = 25;
- $string = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"1"); }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- for ($j = 0; $j < 4000; ++$j)
- {
-
- $nf = scalar(@name); # number of fields
- undef @string; # @string (the flag list) is a different variable from $string
- foreach (@name) { # give us a nice boolean
- push(@string,$_ - 1); # 1 -> 0 (int), 2 -> 1 (char)
- }
-
- #@fields =split('\|',$string,$nf); # $nf will force it to keep nulls at end
- @fields =split('\|',$string); # no limit here, so trailing empty fields are dropped
-
- undef @newfields;
-
- foreach (@string) { # i.e. for each field
- $thisfield = shift(@fields); # take it off the old list
- # NOTE(review): a truth test, so a field holding "0" is also replaced.
- $thisfield = ($_?"~":-1) unless $thisfield; # if it's null, fix it
- push(@newfields,$thisfield); # put it on the new list
- }
-
- # $newstring = join("|",@newfields,"|"); # add a pipe at the end
- $newstring = join("|",@newfields); # this active form adds no trailing pipe
- # print $newstring,"\n";
- }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
-
- ################### FS3 ###########################
-
- #!/usr/local/bin/perl
- # Walk thru a string; if find "||" insert either a ~ or a -1
- # depending on value of @name
- #
- # Splits on '|' and looks up the replacement for each empty field in the
- # %defaults hash, keyed by the field's @name value.
- # Benchmark harness: 4000 passes; @name is all 2s here.
-
- $Num = 25;
- $string = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"2"); }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- #$string = "a|bcd||g||i|||";
- #@name = (1,2,1,1,2,1,2,1);
-
- %defaults = (1, '-1', 2, "~"); # Specify defaults for each of your types
-
- for ($j = 0; $j < 4000; ++$j)
- {
-
- @strings = split(/\|/, $string);
- # NOTE(review): 0 .. $Num visits $Num+1 indices, one more than @name holds.
- for $count ( 0.. $Num )
- # foreach $count ( @strings )
- {
- $strings[$count] = $defaults{ $name[$count] } if $strings[$count] eq '';
- }
- # NOTE(review): this overwrites the input, so every pass after the first
- # splits the already-filled string -- confirm that is what was timed.
- $string = join("|", @strings);
- # print "$string\n";
- }
- print "$string\n";
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- ##################### FS4 #########################
-
- #!/usr/local/bin/perl
- # Splits on '|' with an explicit field limit and replaces each empty field
- # with the entry of a literal list selected by the field's @name value.
- # Benchmark harness: 4000 passes; only the final time is printed.
-
- #$string = "a|bcd||g||i|||";
- #@name = (1,2,1,1,2,1,2,1);
-
-
- $Num = 25;
- $string = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"1"); }
-
- for ($j = 0; $j < 4000; ++$j)
- {
-
- # if ($string =~ /\|\|/)
- { # bare block left behind by the commented-out if above
-
- # chop ($string);
- # The limit (number of names plus one) makes split keep trailing empty fields.
- @field = split (/\|/, $string, @name +1);
- for ($index = 0; $index < @field; $index++)
- {
- # List slice: 1 selects '-1', 2 selects '~'; 0 or an undefined name selects ''.
- $field[$index] = ('', '-1', '~')[$name[$index]] if $field[$index] eq '';
- }
- $newstring = join ('|', @field);
- }
-
- # print "$newstring\n";
- }
- ($u, $s, $cs, $cu) = times;
- print "Time for $j = $u $s\n";
-
- ####### Following code not benchmarked ############
- ######################### FS 5 ###################
- # Raymond Chen -- rjc@math.Princeton.EDU
- #! /usr/local/bin/perl
- # Fill each empty field of $string with the per-field default taken from
- # @default, then print the rebuilt line.
- $string = "a|bcd||g||i|||";
- @default = ("-1","~", "-1", "-1", "~", "-1", "~", "-1","-1");
-
- # A negative limit keeps the trailing empty fields that split would
- # otherwise drop, so all nine fields of the sample line are seen.
- @F = split(/\|/, $string, -1);
- for ($i = 0; $i <= $#F; $i++) { # unroll this loop for speed
- # Compare against '' so that a field holding "0" is not taken for empty.
- $F[$i] = $default[$i] if $F[$i] eq '';
- }
- print join("|", @F), "\n";
- ######################## FS 6 ####################
- # From Mike Flynn (flynn_mike@jpmorgan.com)
-
- #!/usr/local/bin/perl
- # Walk thru a string; if find "||" insert either a ~ or a -1
- # depending on value of @name
- #
- # Prints every field followed by '|'. An empty field is printed as -1 when
- # its @name entry is 1 (int) and as ~ when it is 2 (char), the same
- # convention the FS0 listing uses.
-
- #@name = (1,2,1,1,2,1,2,1);
- $Num = 25;
- $string = "|" x $Num;
- for ($j = 0; $j < $Num; ++$j)
- { push (@name,"1"); }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- for ($j = 0; $j < 40; ++$j)
- {
- $count = 0;
- # A negative limit makes split keep trailing empty fields.
- foreach $element (split(/\|/, $string, -1)) {
- # print "element = $element\n";
- if ($element ne '') { # string test, so a field holding "0" is kept
- print $element;
- print "|" unless ($element eq "\n"); # no pipe after the line terminator
- } else {
- print "-1|" if ($name[$count] == 1); # int
- print "~|" if ($name[$count] == 2); # char
- }
- $count++;
- }
-
- }
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
- ######################### FS 7 ####################
- # From Larry Wall!
-
- #!/usr/local/bin/perl -- # -*-Perl-*-
- # Call this with test.data
- #
- # Reads the input one '|'-terminated record at a time instead of splitting
- # whole lines. The two print statements that would emit the converted
- # data are commented out below, so as listed the loop prints only a
- # newline for each record that starts with one, plus the final time.
- $/ = '|'; # input record separator: each <> read ends at the next '|'
- #$Num = 25;
- #$string = "|" x $Num;
- #for ($j = 0; $j < $Num; ++$j)
- # { push (@name,"2"); }
-
- #($u, $s, $cs, $cu) = times;
- #print "Time = $u $s\n";
-
- @default = (-1,'~',-1,-1,'~',-1,'~',-1,-1,-1,-1,-1,'~',-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,'~');
-
- while (<>) {
- # A record that starts with a newline begins a new input line:
- # strip the newline, restart the field counter, and echo the newline.
- $field = 0, print "\n" if s/^\n//;
- next unless /^\|/; # only a record that starts with '|' is an empty field
- # print $default[$field];
- }
- continue { # runs after every record, including those skipped by next
- # print;
- ++$field;
- ++$k; # running record count; not read anywhere in this listing
- }
-
- # print "$string\n";
-
- ($u, $s, $cs, $cu) = times;
- print "Time = $u $s\n";
-
-
- ########### Finally a submission to do it in C #######
- # From Larry Wall, of all people (:-^)
- #
- # Unfortunately my real code does a lot of other things, so
- # I don't want to recode it in C, and I'm too lazy
- # to learn to add my own user subroutines to Perl right now
- #
-
- #include <stdio.h>
-
- /* Default text for an empty field, indexed by 0-based field number. */
- char *def[] = {"-1","~","-1","-1","~","-1","~","-1"};
-
- /*
-  * Copy stdin to stdout, inserting def[field] between two adjacent '|'
-  * characters. The field counter restarts at every newline.
-  */
- int main(void) {
- int ch;
- int lastch = '\n'; /* nothing read yet: treat it as the start of a line */
- int field = 0;
- int ndef = sizeof(def) / sizeof(def[0]);
-
- while ((ch = getc(stdin)) != EOF) {
- if (ch == '\n')
- field = 0;
- else if (ch == '|') {
- /* Insert a default only for fields that have an entry in def[]. */
- if (lastch == '|' && field < ndef)
- fputs(def[field], stdout);
- field++;
- }
- putc(ch, stdout);
-
-
- lastch = ch;
- }
- return 0;
- }
-
- ############### FS8 ###########################
- # John Stoffel -- <john@wpi.WPI.EDU>
- # Expects $string and @name to be set already; this fragment does not
- # define them. The fields go into a named array instead of relying on
- # scalar-context split filling @_, which perl 5.12 and later no longer do.
- @fields = split(/\|/,$string,-1); # negative limit keeps trailing empty fields
-
- foreach $x (0 .. $#fields) {
- if ($fields[$x] eq "") {
- # 1 (int) gets -1 and anything else gets ~, matching the convention
- # stated at the top of this summary.
- if ($name[$x] == 1) { $fields[$x] = "-1"}
- else { $fields[$x] = "~"}
- }
- }
- print join('|',@fields), "\n";
-
- ############### FS 9 ##########################
- # Unfortunately, I forgot to mention that the
- # strings are variable length, so it's not fair
- # to assume they are always length 8.
- # Thanks for your help, though ...
- # Chris Sherman -- sherman@unx.sas.com
- #!/usr/local/bin/perl
- # Captures the leading '|'-separated fields of $string with a single
- # regex, then fills each empty one: -1 where the matching @name entry is
- # true, ~ where it is false.
- $string = "a|bcd||g||i|||";
- @name = (0,1,0,0,1,0,1,0);
- @vars =
- ($string =~ /([^|]*)\|([^|]*)\|([^|]*)\|([^|]*)\|([^|]*)\|([^|]*)\|([^|]*)/);for ($i = 0; $i < 8; $i++)
- {
- # NOTE(review): the pattern has seven capture groups but the loop visits
- # indices 0..7; the unset eighth element also compares equal to "".
- if ($vars[$i] eq "")
- {
- if ($name[$i])
- {
- $vars[$i] = "-1";
- } else {
- $vars[$i] = "~";
- }
- }
- }
- print join("|",@vars),"|\n";
-
-