Documenting variable and value labeling across a lot of data files

Working with cross-national survey data can be frustrating because of, among other things, incommensurability among surveys’ variable naming and value coding. Multi-national survey programs such as the Demographic and Health Surveys help provide consistent sampling designs, questionnaires and on-the-ground implementation — their work is helpful, indeed.

Inconsistencies are, for a number of reasons, essentially unavoidable. I’ve found that the way one often finds them is by discovering a problem when proceeding with an analysis that assumes all will go well. Doing better, it turns out, doesn’t require so much effort.

Stata’s built-in data documentation commands, -describe- and -codebook- among them, produce nice human-readable reports of variable and value names. They’re no good for making comparisons across lots of data files, however. Wouldn’t it be nice to have a way to easily compare the variable names, value labels, and perhaps some rough descriptive statistics across a more or less arbitrary set of files?

Here ya go, in less than a hundred lines:

/*
Datadoc -- inventory of variables and value labels across many Stata files.

Citation:   Odden, Colin. 2015. Datadoc [Computer Program]. Columbus, OH.
Depends on: -filelist-, Robert Picard, picard@netbox.com (install from SSC)
Thanks to:  Przemek Powalko for identifying three errors, one fatal (now fixed)!

Given -filelist- and the parameters set below, this program will
 (1) traverse a directory containing Stata data files (subdirectories are
     OK, as -filelist- works recursively), and
 (2) record, one variable per line: the filename, the variable name, the
     variable label, the name of the value label applied to the variable,
     a list of the levels of the variable with their associated labels,
     and some miscellaneous descriptives,
 yielding a final file, <job>_vardesc.txt, that you can import into Stata.
This simply automates a very tedious data exploration task.
*/

// Start from a clean session and make room for very wide data files.
clear
clear mata
set trace off
set maxvar 32767

// ---- Dear user: set the following parameters ----

// Prefix for every output filename. It can be arbitrary, but it must obey
// the filename restrictions of your operating system.
local job "dhs"

// Directory that receives the output files; it must be writeable.
local dir_base "/Users/admin/Documents/data"

// Directory holding the source data; change this to your data directory.
local dir_sourcedata "`dir_base'/DHS/backup"

// Variables to skip, if any (a caseid, for example).
local skipvars ""

// Output locations, derived from the settings above.
local file_filelist "`dir_base'/`job'_filelist.dta"
local file_vardesc "`dir_base'/`job'_vardesc.txt"
local file_vardesc_excluded "`dir_base'/`job'_vardesc_pruned.dta"

// begin
// Build the list of source files with -filelist-, then write one
// tab-delimited row per variable per file to the vardesc text file.
// Reads the locals dir_sourcedata, file_filelist, file_vardesc and skipvars.
tempname outfile

// -which- exits with r(111) when the command is not installed. It has to be
// captured: otherwise the do-file stops right here and _rc is never tested.
capture which filelist
if _rc==111 {
	cap noi ssc install filelist
}
filelist, dir("`dir_sourcedata'") pat(*.dta) save("`file_filelist'") replace

// Paths are quoted so that directories containing spaces work.
use "`file_filelist'", clear
file open `outfile' using "`file_vardesc'", write replace
file write `outfile' "fname" _tab "vname" _tab "type" _tab "varlab" ///
	_tab "vallab" _tab "lvls" _tab "quantity" _tab "val_minimum" _tab ///
	"val_p50" _tab "val_max" _n
local obs = _N
forvalues i = 1/`obs' {
	// Reload row i of the file list to recover the path of the next data file.
	use "`file_filelist'" in `i', clear
	local fname = filename[1]
	local f = dirname + "/" + filename
	use "`f'", clear
	// Prefix every value label with its numeric value.
	numlabel, add
	foreach v of varlist _all {
		di "`f' - `v'"
		local thatlvl "|||" // nonsense placeholder
		local typ : type `v'
		local varlbl : variable label `v'
		local b `"`"'
		local varlbl : subinstr local varlbl "`b'" ""
		local varlbl : subinstr local varlbl "`b'" "" // why twice? This was a personal issue with a Peru DHS.
		local mlvls // initialize empty
		local lbl : value label `v'
		file write `outfile' "`fname'" _tab "`v'" _tab "`typ'" _tab `"`varlbl'"' _tab `"`lbl'"' _tab
		// Macro-list membership handles any number of names in skipvars;
		// inlist() would compare against the whole list as a single string.
		local skip : list v in skipvars
		if !`skip' {
			local lvls // initialize empty
			cap levelsof `v', local(lvls)
			local numlvls : word count `lvls'
			if `numlvls' < 40 { // why 40? Seemed right at the time.
				foreach lvl of local lvls { // each level of that variable
					cap local thislvl : label `lbl' `lvl'
					if _rc==0 {
						local thislvl : subinstr local thislvl "`b'" ""
						if `"`thislvl'"'!=`"`thatlvl'"' {
							// Append inside a single quoted string, so that no
							// quote characters pile up in the written value.
							local mlvls `"`mlvls'||`thislvl'"'
						}
					}
					local thatlvl `"`thislvl'"'
				} // end each level
				cap noi file write `outfile' `"`mlvls'"'
			} // end if numlvls<40
		} // end if !inlist
		// Only trust r() when -summarize- itself succeeded; after a failure
		// r(N) could be empty or left over from an earlier command.
		capture summarize `v', detail
		local nobs
		if _rc==0 {
			local nobs "`r(N)'"
		}
		file write `outfile' _tab "`nobs'"
		if "`nobs'"!="" {
			if `nobs'>0 {
				file write `outfile' _tab "`r(min)'" _tab "`r(p50)'" _tab "`r(max)'"
			}
		}
		file write `outfile' _n
	} // end each var
} // end each file
file close `outfile'

Note that without documentation explaining how consistency was achieved, it is impossible to know how much inconsistency has been glossed over by data producers pre-release. That is, apparent consistency to the secondary data analyst is not evidence of consistent production.

Leave a Reply

Your email address will not be published. Required fields are marked *