#!/usr/bin/env perl
use warnings;
use strict;

# OK. First of all. We know that this LOOKS ugly - but we promise -- it's not.
#
# jawk. like 'awk', but Joshy and Perly.
#
# USES NO EXTERNAL MODULES

#use Memoize;   # we're actually not even using memoize now.
#               # one test showed that using memoize() for
#               # convert_args_to_fields() sped up runtime by ~25%

our $VERSION = 0.02;

my $prog      = "jawk";
my $exclude;
my $delimiter = ' ';
my $joiner    = ' ';
my $newline   = 0;    # don't auto-add newlines
my $awky      = 0;    # if we're awky then we use $1 $2 $3 and not @F
my $debug     = 0;
my $warnings  = 0;    # do we show warnings for code run via -e ?

# jawk: like awk '{print $N}', and much more. Use like
#   'ps -auxwww | grep something | jawk 2'
#
# NOTE: use -- option to pass files on the command line.
#
# We _Don't_ use Getopt::Long, because we're not sure how to
# shoehorn our argument processing logic into it.
#
# Specifically, we need to handle negative and positive numbers
# (let's call them A and B) and various A..B-type ranges as options.
# (e.g.: A, ..B, A.., and A..B).
#
# Note that allowing un-hyphened options which are not filenames is
# convenient... but probably breaks POSIX recommendations.
#
# Our closest 'role model' (other than the anachronistic awk)
# is cut, which uses -f 'fields' to specify fields, and allows ranges, but no
# negative ranges, exclusion options, or perl support.
#

############################################
# Run main() only when executed as a script; when this file is loaded with
# require/do (e.g. from a test), only the subs below are defined.
main() unless caller;

############################################
# Usage()
# Returns the complete usage/help text as a single string.
sub Usage {
    return join('',
        "$prog [-x] [-e 'code'] [-d delim] fieldspec [fieldspec...] [-- (FILES..)]:\n",
        " jawk 1 is somewhat like awk '{print \$1}'.\n",
        " jawk also allows ranges with '..'. For example:\n",
        " Fieldspec can be like A, A..B, A.., or ..B (A and B are + or - ints).\n",
        " Negative values for A and B count backwards so -1 is the last field.\n",
        " (NOTE: use -- or - FILENAME.txt to read from files\n",
        " '--' is needed to treat FILENAME.txt as file and not fieldspec.)\n",
        # These option lines used to be cut off by a stray ';' that ended
        # the return statement early.
        " -d delimiter\n",
        " -D turns on DEBUG mode\n",
        " -j joiner\n",
        " -x means _don't_ show the numbered fields, and show the others.\n",
        " -n means add a newline (when in -e mode).\n",
        " -e 'perlcode' : more like awk but with \@F, and perl\n",
        " -w means 'use warnings' for perlcode run via -e\n",
        #" -a is for old (deprecated) 'awky' mode with \$1 \$2 etc (from perl)\n",
    );
}

############################################
# _option_value( $option, $what, $value, $allow_empty )
# Validates the parameter given to a command-line option and returns it.
# Dies with the "needs param" message when the value is missing, or when it
# is empty and $allow_empty is false. A value of "0" is accepted.
sub _option_value {
    my ($option, $what, $value, $allow_empty) = @_;
    my $ok = defined($value) && ($allow_empty || length($value));
    die "$prog: $option ($what) needs param\n" unless $ok;
    return $value;
}

############################################
# main()
# Parses @ARGV by hand, then reads every input line via <>, splits it into
# fields, and either runs the -e code on it or prints the selected fields.
# Exits the process with status 0 when input is exhausted.
sub main {
    #memoize( 'convert_args_to_fields' ); # this speeds up named-query-stats from 2.33 to 1.7 minutes (~25%).
    #memoize( 'invert_fields' );          # this doesn't work (as implemented so far) because we pass fields by reference.

    my @args;        # field specs such as 1, 1.. or 1..2
    my $exe = "";    # whatever perl code we should execute for each line

    # HANDLE Command Line Processing... Manually!
    #
    # shift items off @ARGV, processing as we go along,
    # putting fields like 1, 1.. or 1..2 etc into @args
    #
    # With all the code below, we've finally replicated most of Getopt::Long's -- option :).
    # Except Getopt::Long doesn't store away @args and @ARGV separately like we do.
  ARGVLOOP:
    while ( defined( my $arg = shift @ARGV ) ) {
        if    ( $arg =~ /^-x/ ) { $exclude  = 1; }    # -x option
        elsif ( $arg =~ /^-n/ ) { $newline  = 1; }    # -n option
        #elsif ( $arg =~ /^-a/ ) { $awky = 1; }       # -a is DEPRECATED; AWK MODE IS DISABLED
        elsif ( $arg =~ /^-D/ ) { $debug    = 1; }    # debug
        elsif ( $arg =~ /^-w/ ) { $warnings = 1; }    # turn on perl warnings for -e code
        elsif ( $arg =~ /^-e$/ )       { $exe .= _option_value( '-e', 'exe', shift(@ARGV) ); }
        elsif ( $arg =~ /^-e=(.*)$/s ) { $exe .= _option_value( '-e', 'exe', $1 ); }
        elsif ( $arg =~ /^-d$/ )       { $delimiter = _option_value( '-d', 'delim', shift(@ARGV) ); }
        elsif ( $arg =~ /^-d=(.*)$/s ) { $delimiter = _option_value( '-d', 'delim', $1 ); }
        # an empty joiner is legitimate: it glues the fields together
        elsif ( $arg =~ /^-j$/ )       { $joiner = _option_value( '-j', 'joiner', shift(@ARGV), 1 ); }
        elsif ( $arg =~ /^-j=(.*)$/s ) { $joiner = _option_value( '-j', 'joiner', $1, 1 ); }
        elsif ( $arg =~ /^-\d+/ )      { push( @args, $arg ); }    # negative digits
        elsif ( $arg =~ /^--?$/ )      { last ARGVLOOP; }          # stop processing at - or --
        elsif ( $arg =~ /^-/ ) {                                   # other -options
            die "$prog: Option not understood: $arg\n" . Usage();
        }
        else { push( @args, $arg ); }    # ok; non-hyphenated option like digit or ..
    }

    my $have_exe = length($exe);    # "0" is valid perl code, so test length, not truth
    warn "$prog: Doesn't make sense to use numbered fields and -e, fields ignored\n"
        if ( $have_exe && @args );

    my $quote_meta_delimiter = quotemeta($delimiter);

    # read lines with the magical diamond operator. note use of '--' option, documented above.
    while ( defined( my $line = <> ) ) {
        chomp($line);

        # split the line into parts
        my @parts;
        if ( $delimiter eq ' ' ) {
            @parts = split( ' ', $line );    # ' ' is a special case with split, which acts special. look it up!
        }
        else {
            @parts = split( /$quote_meta_delimiter/, $line );    # so you can split on chars like "("
        }

        if ($have_exe) {
            # if they passed a line to execute, then run it for each line we read.
            # NOTE(review): this is a string eval of code taken from the command
            # line; that is the purpose of -e, but never feed it untrusted input.
            my $exe_expanded = replace_exe_vars( $exe, \@parts );    # expand to perl script
            eval "$exe_expanded";                                    # string eval.
            warn "$prog: Error running: $exe_expanded: $@\n" if $@;
            print "\n" if $newline;                                  # -n is "newline" mode.
        }
        else {
            # otherwise, pull out fields via numbered args.
            # convert the args (things from @ARGV that don't look like command-line options)
            # into fields. Must be done for each line, because we need the
            # number of elements.
            print STDERR "args are @args, parts are @parts\n" if $debug;
            my @fields = convert_args_to_fields( \@args, scalar(@parts) );

            # if we're in -x mode, invert the fields to
            # figure out which are left after exclusions.
            if ($exclude) {
                @fields = invert_fields( \@fields, scalar(@parts) );
            }

            # show the fields we want from @parts
            print( join( $joiner, @parts[@fields] ) . "\n" );
        }
    }
    exit(0);    # done
}

############################################
# my $exe_expanded = replace_exe_vars( $exe, \@fields )
# Prepares the -e code for one input line and returns the string to eval.
# In the normal (non-awky) mode the fields are copied into the package
# variable @F and the code is prefixed with "no strict; " (and with
# "no warnings; " unless -w was given).
# In the disabled 'awky' mode, $1 $2 ... sequences in the code are replaced
# textually by the field values instead.
sub replace_exe_vars {
    my ($exe, $fieldsref) = @_;

    # AWK MODE IS DISABLED ABOVE ($awky is never set)
    if ($awky) {    # awky style, to be deprecated. Replace $1 $2 $3 etc
        while ( $exe =~ m/ \$ ([0-9]+) /x ) {
            my $field   = $1;
            my $search  = '\$' . $field;
            my $replace = ( $field > 0 && $field <= @$fieldsref ) ? $fieldsref->[ $field - 1 ] : "";
            $exe =~ s/ $search /$replace/xg;
        }
    }
    else {    # non awky style, this is recommended
        # $exe holds the code to run on @F
        our @F;                # the global @main::F that the eval'ed code sees
        @F = @$fieldsref;      # yes we use 0-th element, because @F IS NORMAL PERL!
        my $prefix = "no strict; ";
        $prefix .= "no warnings; " unless $warnings;
        $exe = $prefix . $exe;
    }
    return $exe;
}

#########################################################
# _resolve_index( $index, $numparts )
# Maps a 1-based field index that may be negative onto its positive
# equivalent for a line with $numparts fields (-1 becomes $numparts).
# A negative index reaching past the first field yields a value < 1, which
# the caller filters out as out of range.
sub _resolve_index {
    my ($index, $numparts) = @_;
    return $index < 0 ? $index + $numparts + 1 : $index + 0;
}

#########################################################
# convert_args_to_fields( $args_ref, $numparts_in_line )
# args come in 1-based, and are returned 0-based
# handles ranges like 1..3 or 3..1
# also handles negative args alone or in ranges,
# like -2..1, or like -1..1
# Indexes outside the existing fields are silently dropped.
# Dies (with the usage text) on a field spec it cannot parse.
sub convert_args_to_fields {
    my ($args_ref, $numparts) = @_;
    return () unless $numparts;

    print STDERR "$prog: 0: initially (@$args_ref)\n" if $debug;

    # Each negative index is resolved exactly once. The previous textual
    # replace loop re-processed its own negative results, so an out-of-range
    # index like -5 on a 2-field line wrapped around to field 1.
    my $int = qr/-?\d+/;
    my @ret;
    for my $spec (@$args_ref) {
        if ( $spec =~ /^($int)$/ ) {                  # ** a single int
            push @ret, _resolve_index( $1, $numparts );
        }
        elsif ( $spec =~ /^($int)\.\.($int)$/ ) {     # ** an int range
            my ( $from, $to ) = ( $1, $2 );
            push @ret, get_range( _resolve_index( $from, $numparts ), _resolve_index( $to, $numparts ) );
        }
        elsif ( $spec =~ /^($int)\.\.$/ ) {           # ** an integer and up
            my $from = _resolve_index( $1, $numparts );
            push @ret, get_range( $from, max( $from, $numparts ) );
        }
        elsif ( $spec =~ /^\.\.($int)$/ ) {           # ** up to an integer
            push @ret, get_range( 1, _resolve_index( $1, $numparts ) );
        }
        else {
            die "$prog: Don't know how to handle field '$spec'\n" . Usage();
        }
    }
    print STDERR "$prog: 1: modified to (@ret)\n" if $debug;

    @ret = grep { $_ <= $numparts && $_ >= 1 } @ret;    # match only indexes that we have a value for
    print STDERR "$prog: 2: modified to (@ret)\n" if $debug;

    @ret = map { $_ - 1 } @ret;    # shift each int down by one; 1-based to 0-based
    print STDERR "$prog: 3: modified to (@ret)\n" if $debug;

    return @ret;
}

############################################
# invert_fields( [1, 2], 3 );   # ( [activated], num_fields )
# choose the opposite of whatever's currently selected from num_fields.
# all indexes have been converted to non-negatives already, and are 0-based.
# Returns the unselected indexes in ascending order.
sub invert_fields {
    my ($fieldsref, $numparts) = @_;
    my %selected;
    @selected{@$fieldsref} = ();    # set @$fieldsref names as keys
    return grep { !exists( $selected{$_} ) } ( 0 .. $numparts - 1 );
}

############################################
# get_range( $from, $to )
# returns all the ints from $from to $to, inclusive
# handles descending lists, unlike '..' :)
sub get_range {
    my ($from, $to) = @_;
    return ( $from .. $to ) if $from <= $to;

    # they asked for a list in reverse. make it ascending, and reverse it.
    # or as Missy Elliott would say - flip it and reverse it.
    return reverse( $to .. $from );
}

############################################
# max( $x, $y )
# returns the max of the two
sub max {
    my ($x, $y) = @_;
    return $x >= $y ? $x : $y;
}

############################################

1;    # true value, so the file can also be loaded with require

=pod

=head1 NAME

jawk -- like awk, but post-modern and perly. AKA, Josh's Awk.

=head1 SYNOPSIS

jawk [-x] [-e 'code'] [-d delim] fieldspec [fieldspec...] [-- (FILES..)]:

If you haven't seen awk, then jawk can be described as a flexible tool for
extracting columns of data from text files.

If you've seen 'awk', then we can describe jawk as a replacement for
statements like awk '{print $N}' which supports ranges, indexing columns by
negative numbers, a perl mode, and more.

=head1 DESCRIPTION

jawk 1 is somewhat like awk '{print $1}'.

Let's start with a fairly complex example. Suppose you have a file called
'names.txt' with lines of data in this format:

    Bob Elmer, 2716 Fremont Blvd, New York, NY, 12344, ID:91818, CanastaRating:3.1415
    Elmer Fudd, 1 Bunny Hill Drive, Tarrytown, NY, 87654, ID:1, CanastaRating:123456789

This statement would pull out the 1st, and 3rd through last columns,
using ', ' as an input delimiter (we've put two spaces between options,
for clarity):

    jawk  -d', '  1  3..-1  --  names.txt

Note the use of a C<..> range, negative indexes, the non-default element
delimiter via C<-d>, and the C<--> anti-option (which indicates that
following arguments should be considered files to read).

jawk also allows ranges with C<..>. For example, a field specification can
look like C<A>, C<A..B>, C<A..>, or C<..B>, where C<A> and C<B> can be
negative or positive integers. Negative values for A and B count backwards,
so -1 is the last field.

Use -- or - FILENAME.txt to read from files. '--' is needed to treat
FILENAME.txt as file and not fieldspec. See examples below.

Where you might previously use a command like

    grep pattern file.txt | awk '{print $2}'

to pull out the 2nd column from a file, you can now do:

    grep pattern file.txt | jawk 2

jawk offers many other improvements. Here are examples:

select out the 1st, 3rd, and 4th columns from file

    cat file | jawk 1 3 4

select all columns except the 1st, and 9th through remaining.
Uses the -x option for an 'except' meaning.

    cat file | jawk -x 1 9..-1

select out the first through third, and the second to last, and last cols
from a file.

    cat file | jawk 1..3 -2 -1

Same as above, but using : as an input delimiter instead of whitespace.
Note use of -- to start list of files to read from @ARGV, so we can pass
C<file> to jawk directly instead of through C<cat>.

    jawk -d: 1..3 -2 -1 -- file

There is also a -e 'perlcode' (or -e='perlcode') mode where you access the
args via @F, and not via named positional args.
Like so:

    cat file | jawk -e 'print "@F\n";'

=head1 OPTIONS

Here's an explanation of all the command-line options:

=over 4

=item C<A>

A field specification option indicating that this particular column should
(or should not, depending on -x) be output. Negative indexes count from the
right, like in perl, so the right-most column is number C<-1>.

=item C<A..B>

Integer ranges are specified with C<..>, and given that A and B are non-zero
integers, can look like C<A..B> C<A..> C<..B>

If you specify ranges in reverse order from their source, like C<3..1> or
C<-1..1>, you'll get the fields in reverse order, like you asked.

=item -d delimiter (or -d=delimiter)

Specify an alternate delimiter in place of '\s+'. If not ' ', the delimiter
is processed through perl's quotemeta() function and used as a regular
expression to match between input fields.

=item -j joiner (or -j=joiner)

Specify an alternate join character sequence in place of 'space'.

=item -x

Exclude the chosen columns, negating their meaning. Does not interoperate
with -e 'perlcode' option.

=item -n

Add a newline at the end of each line output. Intended for use with
-e 'perlcode' option.

=item -e='perlcode' or -e 'perlcode'

Use perl code passed to process parsed items. Fields come in through the @F
array, and are 0-indexed (like in perl) instead of 1-indexed (like in jawk
and cut).

A simple example, which shows the first and second columns of input, is

    cat file.txt | jawk -e 'print "$F[0] $F[1]\n"'

=item -w

Turn on perl warnings for the code passed via -e. Without -w, that code is
run with warnings disabled.

=item --

Ends argument parsing. Used to pass the names of files to read from, instead
of reading from stdin. See examples above.

=back

=head1 BUGS

None known

=head1 COPYRIGHT

Copyright (c) 2011 Josh Rabinowitz, All Rights Reserved.

=head1 AUTHORS

Josh Rabinowitz

=cut