Variable exploration in many files, cont’d

In “Getting Correspondence from messy surveys” I took a stab at finding variables that are likely what I’m looking for. As the list of data files and my list of search terms grew, I found I needed something a little more extensible, and more general, than my ad hoc scripts.

Below is the result. Here’s what it does. Taking as parameters

  • (a) a path containing your Stata data files of interest (subfolders are OK),
  • (b) a list of strings for which you want matching variables (that is, variables whose names and/or variable labels contain the string), and
  • (c) [OPTIONAL] a list of strings whose matches you want to exclude,

the program:

(1) Uses the excellent -filelist- program written by Robert Picard to traverse the directory set in (a), and all its subfolders, creating a Stata data file containing filenames and full paths, one file per row,

(2) Opens each file identified in (1) and creates an output file, one row for each file, with a set of new variables, one for each search string, containing the names of the variables returned by -lookfor-,

(3) Using the file created in (2), re-opens each of the files found by (1) and writes a new file that has one record per matched variable in each file (that is, if two variables match in file A and three variables in file B, the result of this step will be a file with five rows), recording for each record the file name, the variable name, the string it matched, its variable label and its value labels,

(4) Removes records from the file created in (3) whose variable label matches any of the strings specified in (c), writing a new file containing the reduced set.

Clearly these operations could be completed in fewer steps. I of course welcome you to make refinements and would be grateful if you shared them (with me, especially). I also use Stata’s -file- commands because I love plain text files. All of this could be done with a set of local macros plus -generate- and -append-.


/*
Citation: Odden, Colin. 2015. EZ Varfinder [Computer Program]. Columbus, OH.
Uses: -filelist-, Robert Picard, picard@netbox.com

This program will, given the -filelist- routine (install from SSC)
	and some parameters,
	(1) traverse a directory containing Stata data files (subdirectories are
		OK, as -filelist- works recursively),
	(2) for each file, find variables whose names or labels match a list of
		search strings (via -lookfor-) in `str_tomatch',
	(3) produce a file-variable file (one line per variable in each data file)
		containing variable name, variable label and value labels, and
	(4) remove from the file created in (3) variables whose label matches a
		string specified in `str_exclusions',
	... yielding a final file `job'_vardesc_pruned.dta
	
	Note that this is simply automating a very tedious data exploration and
		management task.
*/

// Dear user: set the following parameters
local job "mics"	// filename prefix -- can be arbitrary
local steps "1,2,3,4" // the steps below will run if included in this list
local dir_base "/Users/admin/WYRKBACK/data" // this directory must be writeable
local dir_sourcedata `dir_base'/unzipped/	// change to your data directory
// Search terms handed to -lookfor- in step 2. Each term is also written as a
// column header in step 2 and read back as a variable name in step 3, so keep
// every term a valid (lowercase) Stata name.
local str_tomatch "matri marri marital union desir pref wait more want inten ideal another contr"
// Substrings that, when found in a variable label, cause the row to be dropped
// in step 4. This must be its own statement: if it shares a line with the
// previous -local-, the text is absorbed into str_tomatch and this macro is
// never defined.
local str_exclusions "illness alone AIDS lazy virus violence passport vaccin medica NHIS servic infect"

// You can leave these alone, or modify names as needed:
local file_filelist `dir_base'/`job'_filelist.dta            // step 1 output: one row per .dta file
local file_varsfound = "`dir_base'/`job'_varsfound.txt"      // step 2 output: one row per file (tab-delimited)
local file_varsfounddta = "`dir_base'/`job'_varsfound.dta"   // step 3 working copy of the step 2 output
local file_vardesc = "`dir_base'/`job'_vardesc.txt"          // step 3 output: one row per matched variable
local file_vardesc_excluded  = "`dir_base'/`job'_vardesc_pruned.dta" // step 4 output: exclusions removed

// begin
tempname outfile // file handle shared by the -file- commands in steps 2 and 3

if inlist(1,`steps') { // step 1
	// Step 1: recursively list every .dta file under `dir_sourcedata' and save
	// the listing (one row per file) to `file_filelist'.
	// Only reach out to SSC when -filelist- is not already installed, so the
	// script does not need network access on every run.
	capture which filelist
	if _rc {
		cap noi ssc install filelist
	}
	filelist, dir("`dir_sourcedata'") pat("*.dta") save("`file_filelist'") replace
} // end step 1

if inlist(2,`steps') { // step 2
	// Step 2: for every data file listed in `file_filelist', run -lookfor- once
	// per search term and write a tab-delimited row to `file_varsfound':
	//   fname, fullpath, then one column per term holding the matching varnames.
	file open `outfile' using "`file_varsfound'", write replace
	file write `outfile' "fname" _tab "fullpath"
	foreach l of local str_tomatch { // header row: one column per search term
		file write `outfile' _tab "`l'"
	}
	file write `outfile' _n
	use "`file_filelist'", clear
	local obs = _N
	forvalues i=1/`obs' {
		// Reload the listing each pass because the data file opened below
		// replaces the dataset in memory.
		qui use "`file_filelist'" in `i', clear
		local fname = filename[1]
		local f = dirname[1] + "/" + filename[1]
		// One unreadable file should not abort the whole scan (and leave the
		// output handle open); report it and move on.
		capture use "`f'", clear
		if _rc {
			di as error "skipping `f' (could not be opened, r(" _rc "))"
			continue
		}
		file write `outfile' "`fname'" _tab "`f'"
		foreach l of local str_tomatch {
			// Every cell starts with the sentinel token X so that a term with
			// no matches still produces a non-empty column; step 3 removes it.
			local looklist_`l' "X "
			qui lookfor `l'
			local looklist_`l' `looklist_`l'' `r(varlist)'
			file write `outfile' _tab "`looklist_`l''"
		}
		file write `outfile' _n
	}
	file close `outfile'
} // end step 2

if inlist(3,`steps') { // step 3
	// Step 3: expand the per-file matches written by step 2 into one row per
	// (file, search term, variable) in `file_vardesc', recording the variable
	// label and the value labels of the levels observed in the data.
	file open `outfile' using "`file_vardesc'", write replace
	file write `outfile' "filename" _tab "varname" _tab "matched" _tab "varlab" _tab "vallab" _n
	import delimited "`file_varsfound'", delimiters("\t") varnames(1) clear
	local obs=_N
	save "`file_varsfounddta'", replace
	forvalues i=1/`obs' {
		qui use "`file_varsfounddta'" in `i', clear // one row at a time
		local fname = fname[1]
		local fullpath = fullpath[1]
		local matches // search terms with at least one hit in this file
		foreach look of local str_tomatch { // each search term (= one column)
			local cell = `look'[1]
			// Drop only the leading sentinel token X written by step 2.
			// Removing every "X" from the string would corrupt variable
			// names that contain an uppercase X.
			gettoken sentinel found : cell
			local `look'_match `found' // macro assignment trims stray blanks
			if "``look'_match'"!="" { // keep `look' only if it matched something
				local matches `matches' `look'
			}
		}
		use "`fullpath'", clear
		cap numlabel, add // prefix value labels with their numeric codes
		di as text "`fname': `matches'" // progress: terms matched in this file
		foreach match of local matches { // each matched string
			foreach var_l of local `match'_match { // each variable found
				local varlbl : variable label `var_l'
				local mlvls // value-label text collected for this variable
				local lbl : value label `var_l'
				// Only variables with an attached value label have label text
				// to collect; skipping the rest also prevents text looked up
				// for an earlier variable from being reused here.
				if "`lbl'"!="" {
					qui levelsof `var_l', local(lvls)
					foreach lvl of local lvls { // each level of that variable
						local lvltext
						cap local lvltext : label `lbl' `lvl'
						if _rc==0 {
							local mlvls `mlvls' `lvltext'
						}
					} // end each level
				}
				// Compound quotes so labels containing a double quote do not
				// break the -file write- statement.
				file write `outfile' `"`fname'"' _tab `"`var_l'"' _tab `"`match'"' _tab `"`varlbl'"' _tab `"`mlvls'"' _n
			} // end each variable
		} // end each matched string
	} // end each survey
	file close `outfile'
} // end step 3

if inlist(4,`steps') { // step 4
	// Step 4: drop every row of `file_vardesc' whose variable label contains
	// one of the exclusion substrings, then save the reduced set as a .dta.
	// Matching is case-sensitive (strpos), so "AIDS" does not match "aids".
	import delimited "`file_vardesc'", delimiters("\t") varnames(1) clear
	di "Before pruning exclusions: " _N
	// If every label is empty, varlab is imported as numeric and strpos()
	// would stop with a type mismatch; there is nothing to prune in that case.
	capture confirm string variable varlab
	if _rc==0 {
		foreach stop of local str_exclusions {
			drop if strpos(varlab,"`stop'")>0
		}
	}
	di "After pruning exclusions: " _N
	save "`file_vardesc_excluded'", replace
} // end step 4
// FIN

Leave a Reply

Your email address will not be published. Required fields are marked *