source: trunk/htdocs/perl/new_find_archives.pl @ 207

Last change on this file since 207 was 207, checked in by MatthewWhiting, 16 years ago

Some minor changes to find_archives, but a major rewrite of it in the form of new_find_archives, to try to get things working properly for cpsr2 AND dfb/wbc.

  • Property svn:executable set to *
File size: 16.5 KB
Line 
1#! /usr/bin/perl -w
2
3# new_find_archives.pl
4# Author: Matthew Whiting
5# Date: January 2008
6# ATNF, CSIRO
7
8# A modified version of Albert's find_archives.pl that hopefully works
9# a bit more consistently for the different data types.
10
11# Crawls through data archive directories (given as an input parameter),
12# searches for all observation and/or calibration files, enquires about
13# their pulsar parameters and inputs these details as well as their
14# location into a mySQL database
15
16use strict;
17use File::Find ();
18#use DBI qw(:sql_types);
19use POSIX;
20
# Package globals.  They are declared through "use vars" so that they are
# usable under "use strict" from &wanted calls (including -eval statements)
# and from every subroutine in this file.

# File::Find aliases and crawl state.
use vars qw(
    *name *dir *prune
    @archive_dirs $archive_dir @files
    $default $output $ext
);

# vap invocation and its raw results.
use vars qw(
    $vap $vap_cmd $vap_out $params $num_vap_params
    @result_params $archive_extensions $db_handle
);

# Per-observation values (several of these are declared for convenience
# but are not assigned anywhere in this file).
use vars qw(
    $filename $raj $decj $dm $period $bw $cnfg
    $freq $inst $mjdint $mjdfrac $npol $nchan $nbin $nsub
    $rcvr $site $length $obsrvr $data_loc $data_type
    $file_size $MJD
    $bmaj $bmin $bpa $hdrver $survey $nbeam
);

# Working values for the coordinate conversions.
use vars qw(
    $raH $raM $raS $decD $decM $decS $rajd $decjd $decstring
    $pi $pion180 $NGP_RA $NGP_DEC $ASC_NODE $draR $decR
    $sinb $sinl $cosl $gb $gl $gl_raw
);

# Fixed labels and settings.
use vars qw(
    $FILE_SIZE_INDEX $FB $DFB $WBC $CPSR2 $NULLFIELD
);

# Column positions of each parameter in a line of vap output.
use vars qw(
    $params_NAME     $params_PROJID   $params_RAJ      $params_DECJ
    $params_FREQ     $params_BW       $params_LENGTH   $params_DATE
    $params_TIME     $params_MJD      $params_BMAJ     $params_BMIN
    $params_BPA      $params_DM       $params_PERIOD   $params_NCHAN
    $params_NPOL     $params_NBIN     $params_NSUB     $params_NBITS
    $params_TSAMP    $params_NBEAM    $params_CNFG     $params_INST
    $params_RCVR     $params_HDRVER   $params_TELESCOP $params_SITE
    $params_OBSRVR   $params_FILENAME
);  # $machine

# Make $name, $dir and $prune refer to the File::Find variables.
*name  = *File::Find::name;
*dir   = *File::Find::dir;
*prune = *File::Find::prune;

# The data types
($FB, $WBC, $DFB, $CPSR2) = ("FB", "WBC", "DFB", "CPSR2");

# filename extensions
$archive_extensions = join(" ", qw(rf cf cfb fb));

# How to write NULL fields to the database
$NULLFIELD = "NULL";

# Pulsar parameters (listed in "vap -H"), in the order they are requested
$params = join(" ", qw(
    name projid ra dec freq bw length stt_date stt_time mjd
    bmaj bmin bpa dm period nchan npol nbin nsub nbits tsamp nbeam
    beconfig backend rcvr hdrver telescop asite observer
));

# The location of vap
$vap = "/pulsar/psr/linux/bin/vap -n";

# vap result parameter indices, used to locate each parameter in a split
# line of vap output.  Numbering starts at 1 because vap prints the
# filename in position 0; the names below follow the order of $params,
# and the final index (30) is the slot of the extra element that the
# callers of populate_database() append after the vap columns.
(
    $params_NAME,     $params_PROJID,   $params_RAJ,      $params_DECJ,
    $params_FREQ,     $params_BW,       $params_LENGTH,   $params_DATE,
    $params_TIME,     $params_MJD,      $params_BMAJ,     $params_BMIN,
    $params_BPA,      $params_DM,       $params_PERIOD,   $params_NCHAN,
    $params_NPOL,     $params_NBIN,     $params_NSUB,     $params_NBITS,
    $params_TSAMP,    $params_NBEAM,    $params_CNFG,     $params_INST,
    $params_RCVR,     $params_HDRVER,   $params_TELESCOP, $params_SITE,
    $params_OBSRVR,   $params_FILENAME,
) = (1 .. 30);
121
122
#####################
# Begin main method
#
# Command line:  find_archives.pl [-d] [-p] directory1 [directory2 ...]
#   -d   delete stale records only (the delete step itself is currently
#        commented out, so this only suppresses the crawl)
#   -p   crawl the directories and process new archives
#   With neither option both steps are performed.

# Number of parameters requested from vap.
my @vap_params = split(/ /, $params);
$num_vap_params = scalar(@vap_params);

# The column number (starting from 0) of the file
# size when calling ls -l
$FILE_SIZE_INDEX = 4;

# Use the default operation which is to firstly delete any
# stale records, then crawl through and insert new records
$default = 1;

print "ARGV = @ARGV\n";

# Without any arguments there is nothing to do: show the usage text and
# stop, rather than carrying on with an undefined $ARGV[0].
if (scalar(@ARGV) == 0) {
    print <<"END_USAGE";
find_archives.pl: A crawler that populates a relational database
with indexes to observational data along with their respective
cal files.

Usage: find_archives.pl [options] directory1 [directory2 ...]

If no options are provided, it will default to do both -d and -p
Options:
\t-d Delete stale records
\t-p Populate database with new records
END_USAGE
    exit 1;
}

# Connect to the database
$db_handle = connectdb();

# Set when the directories left in @ARGV should be crawled.
my $populate = 0;

#########################################################
# Step 1. First remove any stale entries in the database
#         (remove file locations that no longer exist)
#########################################################

if (@ARGV and $ARGV[0] eq "-d") {
#       deleteStaleRecords();
    $default = 0;
    shift @ARGV;
}

#########################################################
# Step 2. Crawl through each input directory and insert
#         any new archives

if (@ARGV and $ARGV[0] eq "-p") {
    $default  = 0;
    $populate = 1;
    shift @ARGV;
}

# Else do both
if ($default) {
#       deleteStaleRecords();
    $populate = 1;
}

if ($populate) {
    @archive_dirs = @ARGV;

    foreach $archive_dir (@archive_dirs) {

        # Check that the input directory is legit
        unless (-d $archive_dir) {
            warn "find_archives.pl: $archive_dir is not a directory, skipping\n";
            next;
        }

        # Traverse desired filesystems
        File::Find::find({wanted => \&wanted}, $archive_dir);
    }
}

# The database disconnect stays disabled, like the connection itself:
#if ($db_handle->disconnect()) {
#       print "Successfully Disconnected from database\n";
#}
#else {
#       print "Error: Failed to disconnect from the database\n";
#       exit;
#}
205
206######################################################################
207
208######################################################################
209# Subroutine definitions
210#
211######################################################################
212
213#######################################################
214
sub wanted {

# Callback for File::Find::find.  $_ holds the name of the entry being
# visited and $File::Find::name its full path.
#
# Directories whose name looks like a pulsar name (an optional letter,
# four digits, a sign and four more digits) are examined:
#   - if they hold files with one of the $archive_extensions, each file is
#     passed to find_data();
#   - otherwise they are taken to be CPSR2 directories, and each
#     observation subdirectory named ????-??-??-??:??:?? is passed to
#     find_data_cpsr2().
# Entries for which isDuplicate() returns 1 are skipped.

    # Everything else is ignored.
    return unless ((-d $_) and /^[a-zA-Z]{0,1}[0-9]{4}[+-][0-9]{4}.*/s);

    print "\n\nIn directory $_\n\n";
    print "\$File::Find::name = $File::Find::name\n";

    # One glob pattern per extension, separated by spaces.
    my $top = $File::Find::name;
    my $pattern = join(" ", map { $top . "/*.$_" } split(/\s+/, $archive_extensions));
    my @data_files = glob($pattern);

    if (scalar(@data_files) != 0) {
        foreach my $data_file (@data_files) {

            # Strip whitespace before the duplicate check
            $data_file =~ s/\s//g;
            next if isDuplicate($data_file) == 1;

            find_data($data_file);
        }
        return;
    }

    # No data files means it's likely to be a CPSR2 directory:
    # handle each observation subdirectory in turn.
    my @observations = glob($top . "/????-??-??-??:??:??");
    foreach my $observation (@observations) {
        print "dir is $observation\n";

        # Strip whitespace before the duplicate check
        $observation =~ s/\s//g;
        next if isDuplicate($observation) == 1;

        find_data_cpsr2($observation);
    }
    return;
}
275               
276
277
278#######################################################
279
sub find_data {

# Usage: find_data(filename)
# Runs vap on the filename and, for every non-empty line of output, splits
# the line on whitespace and hands the fields (followed by the filename)
# to populate_database().
#
# The vap output is filtered through "grep -v filename" and
# "grep -v -x ''", so header and empty lines never reach this code.
# If the pipeline cannot be started a warning is issued and nothing is
# passed on.
#
# NOTE: this is for non-CPSR2 data

    my ($file) = @_;

    # The command goes through the shell (it is a pipeline), so protect the
    # filename with single quotes.
    (my $quoted_file = $file) =~ s/'/'\\''/g;
    my $vap_cmd = "$vap -c \"$params\" '$quoted_file'  | grep -v filename | grep -v -x \"\"";
    print "Calling vap...$vap_cmd |\n";

    my $vap_fh;
    unless (open($vap_fh, '-|', $vap_cmd)) {
        warn "find_data: could not run vap on $file: $!\n";
        return;
    }

    while (my $vap_line = <$vap_fh>) {
        $vap_line =~ s/^\s+//;    # remove any leading space
        my @result_params = split(/\s+/, $vap_line);

        # A whitespace-only line leaves nothing to record.
        next unless @result_params;

        populate_database(@result_params, $file);
    }

    # The exit status is that of the last grep, which is non-zero whenever
    # no lines were printed, so it is deliberately not treated as an error.
    close($vap_fh);

    print "\n";
    return 1;
}
309
310#######################################################
311
sub find_data_cpsr2 {

# Usage: find_data_cpsr2(dirname) where dirname = ????-??-??-??:??:??
# Looks in the observation directory for ".fb" files (or ".cfb" files when
# there are no ".fb" files).  The files are handled in groups according to
# the first letter of their name (m, n or o).  For each non-empty group:
#   - the parameters are taken from the vap output for the first file;
#   - the length parameter is replaced by the sum of the lengths reported
#     by vap for every file in the group;
#   - the result, followed by the directory name, is passed to
#     populate_database().
# A group is skipped, with a warning, when vap gives no output for its
# first file, so that parameters are never carried over from another group.
#
# NOTE: this is only for CPSR2 data

    my ($dir) = @_;

    my @all_files_fb = glob($dir . "/*.fb");
    my $ext = scalar(@all_files_fb) != 0 ? ".fb" : ".cfb";
    print "Searching in $dir for $ext files\n";

    foreach my $pre (qw(m n o)) {

        my $search_str = $dir . "/" . $pre . "*" . $ext;
        print "$pre: Searching for files like: $search_str\n";
        my @all_files = glob($search_str);
        print "Found " . scalar(@all_files) . "\n";

        next unless @all_files;

        # Get most parameters from first file
        my @result_params = _vap_last_row($all_files[0]);
        unless (@result_params) {
            warn "find_data_cpsr2: no vap output for $all_files[0], skipping '$pre' files in $dir\n";
            next;
        }
        print "RESULT_PARAMS:\n@result_params\n";

        # Loop over all files in the group to get the total obs time.  The
        # row already read for the first file is reused.
        my $total_length = 0;
        foreach my $index (0 .. $#all_files) {
            my @file_params = $index == 0
                ? @result_params
                : _vap_last_row($all_files[$index]);

            my $file_length = $file_params[$params_LENGTH];
            if (defined $file_length
                && $file_length =~ /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/) {
                $total_length += $file_length;
            }
            else {
                warn "find_data_cpsr2: no usable length for $all_files[$index]\n";
            }
        }

        print "total length = $total_length\n";
        $result_params[$params_LENGTH] = $total_length;

        populate_database(@result_params, $dir);
    }

    return;
}

sub _vap_last_row {

# Usage: _vap_last_row(filename)
# Runs vap on the file and returns the whitespace-separated fields of the
# last non-blank line of output, or an empty list when there is no output
# or the pipeline cannot be started.

    my ($file) = @_;

    # The command goes through the shell (it is a pipeline), so protect the
    # filename with single quotes.
    (my $quoted_file = $file) =~ s/'/'\\''/g;
    my $vap_cmd = "$vap -n -c \"$params\" '$quoted_file'  | grep -v filename | grep -v -x \"\"";

    my $vap_fh;
    unless (open($vap_fh, '-|', $vap_cmd)) {
        warn "find_data_cpsr2: could not run vap on $file: $!\n";
        return;
    }

    my @last_row;
    while (my $vap_line = <$vap_fh>) {
        $vap_line =~ s/^\s+//;    # remove any leading space
        my @fields = split(/\s+/, $vap_line);
        @last_row = @fields if @fields;
    }

    # The exit status is that of the last grep, which is non-zero whenever
    # no lines were printed, so it is deliberately not treated as an error.
    close($vap_fh);

    return @last_row;
}
378
379#######################################################
380
sub populate_database {

# Usage: populate_database(@result_params, $full_filename)
#   @result_params  fields of one line of vap output: the filename as
#                   printed by vap, then one value per entry of $params
#                   (see the $params_* indices)
#   $full_filename  last argument: full path of the file (or, for CPSR2,
#                   of the observation directory) the values belong to
#
# Works out the derived quantities for the observation - data type, file
# size, position in decimal degrees, Galactic coordinates, beam size - and
# replaces unusable vap values by $NULLFIELD.
#
# No database access is performed here at present: the values are only
# printed.  The derived values are left in the package variables
# $data_type, $file_size, $rajd, $decjd, $gl, $gb, $bmaj, $bmin, $bpa,
# $survey and $nbeam.
#
# Returns 1, or nothing when called without arguments.

    my @results = @_;
    unless (@results) {
        warn "populate_database: called without any parameters\n";
        return;
    }

    my $full_filename = $results[$#results];
    print "full_filename = $full_filename\nRESULT_PARAMS (POP):\n@results\n";

    # Split the location into its directory part and its last component.
    my @dirs = split(/\//, $full_filename);
    my $last_dir_index = $#dirs > 0 ? $#dirs - 1 : 0;
    my $path = join("/", @dirs[0 .. $last_dir_index]);
    my $filename = $dirs[$#dirs];
    print "path = $path, filename = $filename\n";

    if ($filename =~ /[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2}:[0-9]{2}:[0-9]{2}/) {
        # Case of CPSR2 data file: the directory name gives date and time.
        $data_type = $CPSR2;

        my @time_date = split('-', $filename);
        $results[$params_DATE] = $time_date[0] . "-" . $time_date[1] . "-" . $time_date[2];
        $results[$params_TIME] = $time_date[3];

        # First letter of the filename reported by vap.
        my $prefix = substr $results[0], 0, 1;
        print "vap filename = $results[0] and the prefix = $prefix\n";

        # Total size of the files of this group (same selection as the
        # former "du -sb $full_filename/$prefix*fb").  -s follows symbolic
        # links, so a link counts with the size of its target.
        my $size_pattern = "$full_filename/$prefix*fb";
        print "Summing file sizes for $size_pattern\n";
        $file_size = 0;
        foreach my $data_file (glob($size_pattern)) {
            $file_size += (-s $data_file) || 0;
        }
    }
    else {
        if ($filename =~ /^[ar].*\.[rc]f/) {
            $data_type = $DFB;
        }
        elsif ($filename =~ /^w.*\.[rc]f/) {
            $data_type = $WBC;
        }
        else {
            $data_type = $NULLFIELD;
        }

        # Size taken from the file itself rather than from "ls -l" output.
        if (-e $full_filename) {
            $file_size = (-s _) || 0;
        }
        else {
            $file_size = $NULLFIELD;
        }
    }

    print "RESULT_PARAMS (POP 2):\n@results\n";

    #### Calculate the RA and Dec in decimal degrees, and from them the
    #### Galactic longitude and latitude.  A position that cannot be read
    #### gives NULL values instead of a made-up position of (0,0).

    my $ra_hours    = _sexagesimal_to_decimal($results[$params_RAJ]);
    my $dec_degrees = _sexagesimal_to_decimal($results[$params_DECJ]);

    if (defined $ra_hours && defined $dec_degrees) {
        $rajd  = $ra_hours * 15.;
        $decjd = $dec_degrees;
        ($gl, $gb) = _equatorial_to_galactic($rajd, $decjd);
    }
    else {
        $rajd  = $NULLFIELD;
        $decjd = $NULLFIELD;
        $gl    = $NULLFIELD;
        $gb    = $NULLFIELD;
    }

    print "rajd = $rajd, decjd=$decjd, gl=$gl, gb=$gb\n";

    # Beam information
    $bmaj = $results[$params_BMAJ];
    $bmin = $results[$params_BMIN];
    $bpa  = $results[$params_BPA];
    if (_is_missing_or_zero($bmaj)) {
        # could not get beam info from vap output.
        # Need to calculate manually.
        if (_is_missing_or_zero($results[$params_FREQ])) {
            # if the frequency is zero, do not calculate these...
            $bmaj = $NULLFIELD;
            $bmin = $NULLFIELD;
            $bpa  = $NULLFIELD;
        }
        else {
            # 1.2 * wavelength / 64, converted from radians to degrees;
            # the frequency is multiplied by 1.e6 before use.
            my $degrees_per_radian = 45. / atan2(1, 1);
            my $wavelength = 299792458. / ($results[$params_FREQ] * 1.e6);
            $bmaj = (1.2 * $wavelength / 64.) * $degrees_per_radian;
            $bmin = $bmaj;
            $bpa  = 0.;
        }
    }
    else {
        # Major axis is usable; the other two may still be placeholders.
        $bmin = $NULLFIELD if _is_null_token($bmin);
        $bpa  = $NULLFIELD if _is_null_token($bpa);
    }

    if (_is_missing_or_zero($results[$params_NBITS])) {
        $results[$params_NBITS] = $NULLFIELD;
    }
    if (_is_missing_or_zero($results[$params_TSAMP])) {
        $results[$params_TSAMP] = $NULLFIELD;
    }

    # Dud things that aren't required for this set of data.
    $survey = $NULLFIELD;
    $nbeam  = 1;

    # Clean up any NULL parameters
    foreach my $value (@results) {
        $value = $NULLFIELD if _is_null_token($value);
    }

    # Value of one vap column, or NULL when the vap line was too short.
    my $field = sub {
        my ($index) = @_;
        return defined $results[$index] ? $results[$index] : $NULLFIELD;
    };

    # print out the list of parameters
    my @report = (
        [ "filename",  $filename ],
        [ "src_name",  $field->($params_NAME) ],
        [ "projid",    $field->($params_PROJID) ],
        [ "raj",       $field->($params_RAJ) ],
        [ "dec",       $field->($params_DECJ) ],
        [ "data_type", $data_type ],
        [ "freq",      $field->($params_FREQ) ],
        [ "bw",        $field->($params_BW) ],
        [ "scanlen",   $field->($params_LENGTH) ],
        [ "date",      $field->($params_DATE) ],
        [ "ut",        $field->($params_TIME) ],
        [ "MJD",       $field->($params_MJD) ],
        [ "rajd",      $rajd ],
        [ "decjd",     $decjd ],
        [ "gl",        $gl ],
        [ "gb",        $gb ],
        [ "bmaj",      $bmaj ],
        [ "bmin",      $bmin ],
        [ "bpa",       $bpa ],
        [ "dm",        $field->($params_DM) ],
        [ "period",    $field->($params_PERIOD) ],
        [ "nchan",     $field->($params_NCHAN) ],
        [ "npol",      $field->($params_NPOL) ],
        [ "nbin",      $field->($params_NBIN) ],
        [ "nsub",      $field->($params_NSUB) ],
        [ "tsamp",     $field->($params_TSAMP) ],
        [ "nbits",     $field->($params_NBITS) ],
        [ "nbeam",     $nbeam ],
        [ "cnfg",      $field->($params_CNFG) ],
        [ "inst",      $field->($params_INST) ],
        [ "rcvr",      $field->($params_RCVR) ],
        [ "hdrver",    $field->($params_HDRVER) ],
        [ "survey",    $survey ],
        [ "telescope", $field->($params_TELESCOP) ],
        [ "site",      $field->($params_SITE) ],
        [ "obsrvr",    $field->($params_OBSRVR) ],
    );
    foreach my $entry (@report) {
        print $entry->[0] . " = " . $entry->[1] . "\n";
    }
    print "data_loc = $path\n";
    print "file_size = $file_size bytes\n";

    return 1;
}

sub _sexagesimal_to_decimal {

# Usage: _sexagesimal_to_decimal("[+-]dd[:mm[:ss.s]]")
# Returns dd + mm/60 + ss/3600, negated when the text starts with "-"
# (so that "-00:30:00" gives -0.5).  Returns undef when the text does not
# have this form.

    my ($text) = @_;
    return undef unless defined $text;
    return undef
        unless $text =~ /^([+-]?)(\d+(?:\.\d*)?)(?::(\d+(?:\.\d*)?))?(?::(\d+(?:\.\d*)?))?$/;

    my ($sign, $whole, $minutes, $seconds) = ($1, $2, $3, $4);
    $minutes = 0 unless defined $minutes;
    $seconds = 0 unless defined $seconds;

    my $value = $whole + $minutes / 60. + $seconds / 3600.;
    return $sign eq '-' ? -1. * $value : $value;
}

sub _equatorial_to_galactic {

# Usage: ($gl, $gb) = _equatorial_to_galactic($ra_degrees, $dec_degrees)
# Returns the Galactic longitude, in the range 0 <= gl < 360, and the
# Galactic latitude, both in degrees.

    my ($ra_degrees, $dec_degrees) = @_;

    my $radians_per_degree = atan2(1, 1) / 45.;
    my $ngp_ra   = 192.859508 * $radians_per_degree;    # location of NGP
    my $ngp_dec  = 27.128336 * $radians_per_degree;
    my $asc_node = 32.932;

    my $delta_ra = $ra_degrees * $radians_per_degree - $ngp_ra;
    my $dec      = $dec_degrees * $radians_per_degree;

    my $sin_b = cos($dec) * cos($ngp_dec) * cos($delta_ra) + sin($dec) * sin($ngp_dec);
    # Rounding can push the value just outside [-1,1].
    $sin_b = 1.  if $sin_b > 1.;
    $sin_b = -1. if $sin_b < -1.;
    my $latitude = atan2($sin_b, sqrt(1. - $sin_b * $sin_b));

    # Sine and cosine of (longitude - $asc_node), both still multiplied by
    # cos(latitude).  That common factor is never negative, so it does not
    # change the angle, and atan2 picks the correct quadrant without any
    # division.
    my $sin_part = sin($dec) * cos($ngp_dec) - cos($dec) * cos($delta_ra) * sin($ngp_dec);
    my $cos_part = cos($dec) * sin($delta_ra);

    my $longitude = atan2($sin_part, $cos_part) / $radians_per_degree + $asc_node;
    $longitude += 360. while $longitude < 0.;
    $longitude -= 360. while $longitude >= 360.;

    return ($longitude, $latitude / $radians_per_degree);
}

sub _is_null_token {

# Usage: _is_null_token(value)
# True when the value is undefined or one of the placeholders that are
# replaced by $NULLFIELD.

    my ($value) = @_;
    return 1 unless defined $value;
    foreach my $token ("N/A", "UNDEF", "INVALID", "*error*", "*") {
        return 1 if $value eq $token;
    }
    return 0;
}

sub _is_missing_or_zero {

# Usage: _is_missing_or_zero(value)
# True when the value is undefined, is not a number, or is equal to zero.

    my ($value) = @_;
    return 1 unless defined $value;
    return 1 unless $value =~ /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/;
    return $value == 0 ? 1 : 0;
}
570
571#######################################################
572
sub isDuplicate {

# Usage: isDuplicate(location)
# Callers skip a file or directory when this returns 1.
#
# DEBUGGING: no lookup is done; every location is reported as new (0).

    my $already_recorded = 0;
    return $already_recorded;
}
579#######################################################
580
sub connectdb {

# Usage: $handle = connectdb()
#
# DEBUGGING: no connection is made.  A message is printed and the value 1
# is returned in place of a database handle.
#
# The real connection to the MySQL server is kept below, disabled
# (NOTE(review): it carries hard-coded credentials):
#       my $dbh = DBI->connect("dbi:mysql:database=psrchive;host=localhost", "psrdba", "lighthouse")
#       or die "Couldn't connect to database: $DBI::errstr\n";
#
#       print "Successfully connected to db\n" if $dbh;
#       return $dbh;

    my $placeholder_handle = 1;
    print "We would normally connect to the DB here\n";
    return $placeholder_handle;
}
Note: See TracBrowser for help on using the repository browser.