NETSretailerscode

NETSretailerscode

*****************************************************************************************
** NETS data cleaning for years 2000-2017 and all states using NETS2017 data
**
** created by May Kuo, tkuo@email.unc.edu
**
** July 6, 2021
**
*****************************************************************************************;

libname raw17 “<path for raw NETS2017 data>”;
libname datasets “<path for output dataset>”;

%macro clean (year, yr);
*****************************************************************************************************
Limit to NAICS codes 447110, 447190, 445120, 453991, 445110, 452311(old 452910), 446110,
445310, 452210 (old 452112), 452319 (old 452990) using 2017 xwalk to SIC
*****************************************************************************************************;

data clean1_&yr not_naics_&yr;
set raw17.nets2017 (keep= DunsNumber Company Address — ZIP4 FipsCounty — LevelCode sic2 — IndustryGroup
FIPS00 — YearStart MoveYears –DestZIP Minority WomenOwned–MoveOften FirstYear–CityCode_First);

if sic&yr=55419904 then naics=’447110′;
else if sic&yr=55410000 then naics=’447190′;
else if sic&yr=55419900 then naics=’447190′;
else if sic&yr=55419901 then naics=’447190′;
else if sic&yr=55419902 then naics=’447190′;
else if sic&yr=55419903 then naics=’447190′;
else if sic&yr=54110200 then naics=’445120′;
else if sic&yr=54110201 then naics=’445120′;
else if sic&yr=54110202 then naics=’445120′;

else if sic&yr=59930000 then naics=’453991′;
else if sic&yr=59939900 then naics=’453991′;
else if sic&yr=59939901 then naics=’453991′;
else if sic&yr=59939902 then naics=’453991′;
else if sic&yr=59939903 then naics=’453991′;
else if sic&yr=59939904 then naics=’453991′;

else if sic&yr=54110000 then naics=’445110′;
else if sic&yr=54110100 then naics=’445110′;
else if sic&yr=54110101 then naics=’445110′;
else if sic&yr=54110102 then naics=’445110′;
else if sic&yr=54110103 then naics=’445110′;
else if sic&yr=54110104 then naics=’445110′;
else if sic&yr=54110105 then naics=’445110′;
else if sic&yr=54119900 then naics=’445110′;
else if sic&yr=54119901 then naics=’445110′;
else if sic&yr=54119902 then naics=’445110′;
else if sic&yr=54119903 then naics=’445110′;
else if sic&yr=54119904 then naics=’445110′;
else if sic&yr=54119905 then naics=’445110′;

else if sic&yr=53999906 then naics=’452311′;

else if sic&yr=59120000 then naics=’446110′;
else if sic&yr=59129900 then naics=’446110′;
else if sic&yr=59129901 then naics=’446110′;
else if sic&yr=59129902 then naics=’446110′;

else if sic&yr=59210000 then naics=’445310′;
else if sic&yr=59210100 then naics=’445310′;
else if sic&yr=59210101 then naics=’445310′;
else if sic&yr=59210102 then naics=’445310′;
else if sic&yr=59219900 then naics=’445310′;
else if sic&yr=59219901 then naics=’445310′;

else if sic&yr=53119901 then naics=’452210′;
else if sic&yr=53310000 then naics=’452319′;
else if sic&yr=53990000 then naics=’452319′;
else if sic&yr=53999900 then naics=’452319′;
else if sic&yr=53999901 then naics=’452319′;
else if sic&yr=53999902 then naics=’452319′;
else if sic&yr=53999903 then naics=’452319′;
else if sic&yr=53999904 then naics=’452319′;
else if sic&yr=53999905 then naics=’452319′;
else if sic&yr=55319900 then naics=’452319′;
if naics ^=”” then output clean1_&yr;
else if sic&yr ^=. then output not_naics_&yr;
run;

*****************************************************************************************
** Add text search for tobacco stores from data not in the selected NAICS
*****************************************************************************************;

data tobacco_&yr tobacco_ck&yr;
set not_naics_&yr ;

if ( index(upcase(company),”TOBACCO”) >0 or index(upcase(company),”CIGARETTE”) >0 or
index(upcase(company),”VAP”) >0 ) then do;

array include (*) inc00-inc17;
array sic(*) sic00-sic17;
array naics_var(*) naics00-naics17;

**include spec SIC codes;
do e=1 to dim(sic);
if 53000000<= sic(e) <=53999999 then include(e)=1;
else if 54000000<= sic(e) <=54999999 then include(e)=1;
else if 55000000<= sic(e) <=55999999 then include(e)=1;
else if 56000000<= sic(e) <=56999999 then include(e)=1;
else if 57000000<= sic(e) <=57999999 then include(e)=1;
else if 58000000<= sic(e) <=58999999 then include(e)=1;
else if 59000000<= sic(e) <=59999999 then include(e)=1;
else if 70000000<= sic(e) <=70999999 then include(e)=1;
else if 78000000<= sic(e) <=78999999 then include(e)=1;
else if 79000000<= sic(e) <=79999999 then include(e)=1;

else if sic(e) >=0 then include(e)=0;
end;

end;

do k=1 to dim(include);
if include(k)=1 then tobacco_keep=1;
end;

if tobacco_keep=1 then output tobacco_&yr;
else output tobacco_ck&yr;
run;

*****************************************************************************************************
Identify Gas station stores
*****************************************************************************************************;

data type1_&yr ;
set clean1_&yr;

***Gas stations (with and without convenience): EXCLUDE Marine service stations (SIC: 55419902);
IF NAICS in (“447110” , “445120”, “447190”) then do;
if sic&yr=55419902 then delete;
end;
run;

***step 2_2: data cleaning following the methodology:
store type 2. tobacco stores;
data type2_&yr ;
set type1_&yr ;

**Tobacco stores: EXCLUDE Marijuana stores (SIC: 59939905);
if NAICS =”453991″ then do;
if sic&yr=59939905 then delete;
end;
run;

*****************************************************************************************************
Identify supermarket and grocery stores, exclude SIC codes: coops (54119901), delis (54119902),
frozen foods (54119903)
*****************************************************************************************************;
data type3_&yr ;
set type2_&yr;

if NAICS =”445110 ” then do;
if ( index(upCASE(company),”ALDI”)>0 or index(upCASE(company),”TJS”) >0 or index(upCASE(company),”TRADER JOES”) >0 or
index(upCASE(company),”TRADERJOES”) >0 or
index(upCASE(company),”WHOLE FOOD”) >0 or index(upCASE(company),”WHOLEFOOD”) >0 or
index(upcase(company), “SCHWANS HOME SERVICE”) >0 or
index(upcase(company), “FRESH and EASY NEIGHBORHOOD MARKET”) >0 or
index(upcase(company), “FRESH MARKET”) >0 or
index(upcase(company), “SPROUTS”) >0)then delete;

if sic&yr in (54119901, 54119902, 54119903) then delete;

if &year >=2009 and (index(upCASE(company),”WEGMANS”) >0 OR index(upCASE(company),”WEGMAN”) >0)
then delete;
if state=”CA” and &year >=2016 and index(upCASE(company),”RALEYS”) >0 then delete;
END;
run;

*****************************************************************************************************
Identify Pharmacies (446110)
*****************************************************************************************************;
data type5_&yr ;
set type3_&yr;

**From pharmacies, INCLUDE only: Walgreens, Rite Aid, Kmart Pharmacies, Eckerd, Longs Drug Stores,
New Albertsons, American Drug Stores, Bond Drug Company, Duane Reader, Medicine Shoppe,
Thrifty Payless Inc, Hook SuperX, Kerr Drug, Arbor Drug, CVS;

if naics=”446110″ then do;
**keep the following;
if ( index(upCASE(company), “WALGREEN”)>0 or index(upCASE(company),”RITE AID”) >0 or
index(upCASE(company), “KMART PHARMACIES”) >0 or index(upCASE(company),”KMART PHARMACY”) >0 or
index(upCASE(company), “K MART PHARMACY”) >0 or
index(upcase(company), “ECKERD”) >0 or index(upCASE(company),”ECKAERD”) >0 or
index(upcase(company), “LONGS DRUG”) >0 or index(upcase(company), “NEW ALBERTSONS”) >0 or
(index(upcase(company), “LONGS”) >0 and index(upcase(company), “DRUG”) >0) or
index(upcase(company), “LONGS PHARMACY”) >0 or
index(upcase(company), “AMERICAN DRUG”) >0 or Index(upcase(company), “BOND DRUG”) >0 or
index(upCASE(company), “DUANE READ”) >0 or index(upCASE(company), “MEDICINE SHOPPE”) >0 or
index(upCASE(company), “MEDICINE SHOP”) >0 or index(upCASE(company), “MEDICINE SHP”) >0 or
index(upCASE(company), “THRIFTY PAYLESS”) >0 or index(upcase(company), “HOOK SUPERX”) >0 ) or
upcase(compress(company,,”ps”)) =: “HOOKSUPERX” or
index(upCASE(company), “KERR DRUG”) >0 or index(upCASE(company), “ARBOR DRUG”) >0
then drugStore_keep=1;
if &year <=2014 and index(upCASE(company), “CVS”) >0 then drugStore_keep=1;
if drugStore_keep=. then drugStore_keep=0;
end;

***only keep specific stores;
IF naics=”446110″ and drugStore_keep^=1 then delete;
run;

***deal with pharmacy ban;
data type5a_&yr ck5_&yr;
set type5_&yr;

** exclude all pharmacies (446110), warehouse clubs/supercenters (452311), and Walmarts beginning
in the year following implementation of the ban ;

if upcase(compress(company,,”ps”)) =: “WALMART” then i_walmart=1;

if &year >=2009 and state=”CA” and city=”SAN FRANCISC” and naics=”446110″ then phban_keep=0;
if &year >=2009 and state=”MA” and city=”BOSTON” then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2010 and ((state=”CA” and city=”RICHMOND”) or (state=”MA” and city in (“NEWTON”, “NEEDHAM”, “UXBRIDGE”) )) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2011 and state=”MA” and city in (“EVERETT”, “SOUTHBOROUGH”) THEN do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2011 and state=”CA” and city=”SAN FRANCISC” then do;
if naics in (“446110”, “452311”) then phban_keep=0;
end;

if &year>=2012 and state=”MA” and city in (“CHATHAM”, “FALL RIVER”, “NEW BEDFORD”, “NORTH ATTLEB”, “HATFIELD”,
“CONCORD”, “LOWELL”, “SOMERVILLE”, “WAKEFIELD”, “WESTFORD”, “WINCHESTER”, “BROOKLINE”, “WELLESLEY”,
“WESTWOOD”, “WAREHAM”, “REVERE”, “OXFORD”, “WORCESTER”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2013 and state=”MA” and city in (“BREWSTER”, “HARWICH”, “LEE”, “LENOX”, “PITTSFIELD”, “STOCKBRIDGE”, “DARTMOUTH”, “FAIRHAVEN”,
“WESTPORT”, “HAVERHILL”, “MIDDLETON”, “SALEM”, “SAUGUS”, “BUCKLAND”, “MONTAGUE”, “SPRINGFIELD”, “ASHLAND” ,
“BEDFORD”, “MALDEN”, “MELROSE”, “READING”, “WATERTOWN”, “MIDDLEBOROUG”, “ROCHESTER”, “BARRE”, “FITCHBURG”,
“GARDNER”, “GRAFTON”, “LEOMINSTER”, “WEST BOYLSTO”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2014 and state=”MA” and city in (“BARNSTABLE”, “FALMOUTH”, “YARMOUTH”, “BERKLEY”, “EASTON”, “EDGARTOWN”, “GLOUCESTER”,
“LYNN”, “ROCKPORT”, “DEERFIELD”, “GILL”, “GREENFIELD”, “SHELBURNE”, “SUNDERLAND”, “WHATLEY”, “WEST SPRINGF”,
“AMHERST”, “ACTON”, “ARLIGHTON”, “BILLERICA”, “DRACUT”, “SUDBURY”, “TOWNSEND”, “DEDHAM”, “ABINGTON”,
“CHELSEA”, “ATHOL”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2014 and state=”MN” and fipscounty=27133 then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year >=2015 and state=”CA” and /*fipscounty=6041 or*/ city in (“BERKELEY”, “HEALDSBURG”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2015 and state=”MA” and city in (“ADAMS”, “SHEFFIELD”, “WILLIAMSTOWN”, “ANDOVER”, “BEVERLY”,
“HAMILTON”, “LAWRENCE”, “NEWBURYPORT”, “ORANGE”, “BRIMFIELD”, “EASTHAMPTON”, “GRANBY”, “NORTHAMPTON”,
“SOUTH HADLEY”, “HUDSON”, “NORTH READIN”, “SHERBORN”, “STONEHAM”, “WAYLAND”, “BRAINTREE”, “COHASSET”,
“MEDWAY”, “MILTON”, “STOUGHTON”, “WEYMOUTH”, “BRIDGEWATER”, “CARVER”, “DUXBURY”, “HULL”, “MARION”,
“SCITUATE”, “BOLTON”, “HUBBARDSTON”, “TEMPLETON”, “WESTBOROUGH”, “WINCHENDON”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2016 and state=”CA” and city in (“HOLLISTER”, “DALE CITY”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2016 and state=”MA” and city in (“EASTHAM”, “LANESBOROUGH”, “TISBURY”, “DANVERS”, “CHARLEMONT”, “LEVERETT”,
“WESTFIELD”, “SOUTHAMPTON”, “CAMBRIDGE”, “MARLBOROUGH”, “WILMINGTON”, “MEDFIELD”, “MILLIS”, “WALPOLE”,
“HALIFAX”, “MARSHFIELD”, “NORWELL”, “ASHBURNHAM”, “MENDON”, “NORTHBOROUGH”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

if &year>=2017 and state=”MA” and city in (“BOURNE”, “ORLEANS”, “NORTH ADAMS”, “ESSEX”, “HOLYOKE”, “LUDLOW”,
“BELCHERTOWN”, “HADLEY”, “MAYNARD”, “TYNGSBOROUGH”, “WESTON”, “HOLBROOK”, “BROCKTON”, “WHITMAN”,
“HOPEDALE”) then do;
if naics in (“446110”, “452311”) then phban_keep=0;
else if i_walmart=1 then phban_keep=0;
end;

IF phban_keep=0 then OUTPUT ck5_&yr;
else output type5a_&yr;

RUN;

*****************************************************************************************************
Identify Beer, wine and liquor stores (445310)
*****************************************************************************************************;
data type6_&yr;
set type5a_&yr;

** EXCLUDE state run liquor stores and wine stores that don’t sell tobacco in states that control liquor or wine.
Exclude stores with names that contain: “state”, “abc”, “board”, “ctrl”, “alcoholic”, “bev”, “beverage”,
“dept”, “county”, “bd”, “control”;

if naics=’445310′ then do;

***create indicator of name in the specified words;

**state;
f2=findw(company,’state’, ‘ -/’, ‘i’);
***ABC;
f3=findw(company,’abc’, ‘ -/’, ‘i’);
f3A=findw(company,’a b c’, ‘ -/’, ‘i’);

***control;
f5=findw(company,’ctrl’, ‘ -/’, ‘i’);
f12=findw(company,’control’, ‘ -/’, ‘i’);
f12a=findw(company,’controll’, ‘ -/’, ‘i’);

***agency;
f18=findw(company,’agency’, ‘ -/’, ‘i’);
** NEW HAMPSHIRE LIQUOR;
f20=findw(company, ‘NEW HAMPSHIRE LIQUOR’,’ -/’, ‘i’);

 

if state=”AL” and sum(of f2, f3, F3a, f5, f12, f12a) >0 then drop=1;
else if state=”ID” and findw(company,’state liquor’, ‘ -/’, ‘i’) >0 then drop=1;
else if state=”ME” and &year in (2000, 2001, 2002, 2003) and (findw(company,’state liquor’, ‘ -/’, ‘i’) >0 or sum (of f5, f12, f12a)> 0) then drop=1;
else if state=”NC” and (f3>0 or f3a >0) then drop=1;
else if state=”PA” and sum (of f5, f12, f12a) >0 then drop=1;
else if state=”UT” and sum (of f5, f12, f12a, f18) >0 then drop=1;
else if state=”VA” and (sum (of f3,f3a) >0 or findw(company,’ALCOHOLIC BEV CTRL’, ‘ -/’, ‘i’) >0)
then drop=1;
else if state=”WA” and 2000<= &year <= 2012 and sum (of f5, f12, f12a, f18, f2) >0 then drop=1;
else if state=”MA” and fipscounty=24031 then drop=1;
else if state=”NH” and f20 >0 and f18 <=0 then drop=1;

end;
***keep records that are not identified to be dropped;
if drop ^=1;
run;

*****************************************************************************************************
Identify Discount department store (452112 & 452990): INCLUDE Walmart, Kmart, Meijer Shopko, 99 cents stores,
Family Dollar, Dolgencorp/Dollar General, military exchanges.
*****************************************************************************************************;

data type7_&yr;
set type6_&yr;

IF NAICS in (“452210” “452319”) then do;

if upcase(compress(company,,”ps”)) =: “WALMART” then keep=1;
else if upcase(compress(company,,”ps”)) = “KMART” then keep=1;

else if index(upcase(company),”MEIJER”) >0
or index(upcase(company), “SHOPKO”) >0 or index(upcase(company),”99 CENTS STORE”) >0 or
index(upcase(company), “ALCO STORES”) >0 or index(upcase(company), “FRED STORES”) >0 then keep=1;

if &year >=2013 and index(upcase(company),”FAMILY DOLLAR”) >0 then keep=1;
if &year >=2014 and (index(upcase(company), “DOLGENCORP”) >0 or
index(upcase(company),”DOLLAR GENERAL”)>0 or index(upcase(company), “DOLLAR TREE”)>0) then keep=1;

if (index(upcase(company),”AAFES”) >0 or index(upcase(company),”PX”) >0 or
index(upcase(company),”BX”) >0 or index(upcase(company),”POST EXCHANGE”) >0 or
index(upcase(company), “BASE EXCHANGE”) >0) then keep=1;

if keep =. then keep=0;
end;
if NAICS in (“452210” “452319”) and keep ^=1 then delete;;
run;

*****************************************************************************************************
Combine data from NAICS code and text search tobacco stores
*****************************************************************************************************;
data datasets.keep_&yr;
set type7_&yr tobacco_&yr;
***create include indicator in each year for analysis;

include_&yr=1;
drop naics– tobacco_keep;
run;

proc sort ;
by DunsNumber;
run;
%mend;

%clean (2000, 00)
%clean (2001, 01)
%clean (2002, 02)
%clean (2003, 03)
%clean (2004, 04)
%clean (2005, 05)
%clean (2006, 06)
%clean (2007, 07)
%clean (2008, 08)
%clean (2009, 09)
%clean (2010, 10)
%clean (2011, 11)
%clean (2012, 12)
%clean (2013, 13)
%clean (2014, 14)
%clean (2015, 15)
%clean (2016, 16)
%clean (2017, 17)

;

*****************************************************************************************************
Remove the stores with pharmacy ban, address is missing, and duplicates, in 2011-2017
*****************************************************************************************************;
%macro remove2 (file, year, yr);
***identify stores to remove due to pharmacy ban from GIS analysis output;

PROC IMPORT OUT= &file._&yr DATAFILE= “P:\Projects\ICISS_Ribisl_Retailers\GIS\Join_&year..xlsx”
DBMS=xlsx REPLACE;
SHEET=”Join_&year”;
GETNAMES=YES;
RUN;

data &file.2_&yr;
set &file._&yr ;
if NAMELSAD ^=””;
format dunsnumber best12.;
keep dunsnumber NAMELSAD;
run;
proc sort data=&file.2_&yr;
by dunsnumber;
run;

data clean_&yr;
merge datasets.keep_&yr (in=a) &file.2_&yr (in=b);
by dunsnumber;
if a and not b;
drop NAMELSAD;
run;

**Drop stores with missing addresses or PO box address;
proc sql;
create table clean2_&yr as
select *
from clean_&yr
where not (missing(address) or upper(compress(address,,”ps”)) contains “POBOX”)
;quit;

**remove duplicates:
(1). When 7 or more stores appear at the same address, remove all of stores (these are likely corporate headquarters and not actually stores)
(2). When 6 or less stores appear at same address check to see if stores have same name, NAICS, first and last year
(must have all 4 along with same address), and if so, remove the duplicates (keep one of the duplicate stores).
Otherwise keep all stores when 6 or less at same address.;

data clean2r_&yr;
set clean2_&yr;

**create address_id;
address_id=upcase(compress(address,,”ps”))||upcase(compress(city,,”ps”))||upcase(compress(state,,”ps”))||upcase(compress(ZipCode,,”ps”));
run;

proc sort;
by address_id;
run;

data nodup_&yr (drop=address_id n) dup7_&yr (keep=address_id) dup6_&yr (keep=address_id);
set clean2r_&yr;
by address_id;
if first.address_id then n=0;
n+1;
if last.address_id then do;
if n=1 then output nodup_&yr;
else if 1< n <=6 then output dup6_&yr;
else if n>=7 then output dup7_&yr;
end;
run;

proc sql;
create table dup6_&yr.r as
select *
from dup6_&yr as a
left join clean2r_&yr as b
on a.address_id=b.address_id;
quit;

***only keep one record when there are duplicates in address, SIC, FirstYear, and LastYear;
proc sort data=dup6_&yr.r out=dup6_&yr.r2 nodupkey;
by address_id company sic&yr firstyear LastYear;
run;

***combine nodup dataset and dup6_r2;
data final_&yr;
set nodup_&yr dup6_&yr.r2;
drop address_id;
run;
proc sort;
by dunsnumber;
run;
%mend;

%remove2 (area, 2011, 11);
%remove2 (area, 2012, 12);
%remove2 (area, 2013, 13);
%remove2 (area, 2014, 14);
%remove2 (area, 2015, 15);
%remove2 (area, 2016, 16);
%remove2 (area, 2017, 17);

*****************************************************************************************************
Also exclude stores with missing address duplicates in 2000-2010
*****************************************************************************************************;
%macro drop2 (yr);
**Drop stores with missing addresses or PO box address;
proc sql;
create table clean2_&yr as
select *
from datasets.keep_&yr
where not (missing(address) or upper(compress(address,,”ps”)) contains “POBOX”)
;quit;

**Remove duplicates:
(1). When 7 or more stores appear at the same address, remove all of stores (these are likely corporate headquarters and not actually stores)
(2). When 6 or less stores appear at same address check to see if stores have same name, NAICS, first and last year
(must have all 4 along with same address), and if so, remove the duplicates (keep one of the duplicate stores).
Otherwise keep all stores when 6 or less at same address.;

data clean2r_&yr;
set clean2_&yr;

**create address_id;
address_id=upcase(compress(address,,”ps”))||upcase(compress(city,,”ps”))||upcase(compress(state,,”ps”))||upcase(compress(ZipCode,,”ps”));
run;
proc sort;
by address_id;
run;

data nodup_&yr (drop=address_id n) dup7_&yr (keep=address_id) dup6_&yr (keep=address_id);
set clean2r_&yr;
by address_id;
if first.address_id then n=0;
n+1;
if last.address_id then do;
if n=1 then output nodup_&yr;
else if 1< n <=6 then output dup6_&yr;
else if n>=7 then output dup7_&yr;
end;
run;

*** add address_id back to the raw data for checking data elements;
proc sql;
create table dup6_&yr.r as
select *
from dup6_&yr as a
left join clean2r_&yr as b
on a.address_id=b.address_id;
quit;
***only keep one record in these address dup stores when there are duplicates in address, SIC, FirstYear, and LastYear;
proc sort data=dup6_&yr.r out=dup6_&yr.r2 nodupkey;
by address_id company sic&yr firstyear LastYear;
run;

***combine nodup dataset and dup6_r2;
data final_&yr;
set nodup_&yr dup6_&yr.r2;
drop address_id;
run;
proc sort;
by dunsnumber;
run;

%mend;

%drop2 (00)
%drop2 (01)
%drop2 (02)
%drop2 (03)
%drop2 (04)
%drop2 (05)
%drop2 (06)
%drop2 (07)
%drop2 (08)
%drop2 (09)
%drop2 (10)
;

*****************************************************************************************************
**Combine data for all years
*****************************************************************************************************;

data temp_0017;
merge final_00 – final_17;
by DunsNumber;
array yr (*) include_00 – include_17;
do i=1 to dim(yr);
if yr(i)=. then yr(i)=0;
end;

drop i;

**create NAICS variable fro each year;

array sic(*) sic90-sic99 sic00-sic17;
array naics(*) naics90 -naics99 naics00-naics17;

do j=1 to dim(sic);

if sic(j)=55419904 then naics(j)=’447110′;
else if sic(j)=55410000 then naics(j)=’447190′;
else if sic(j)=55419900 then naics(j)=’447190′;
else if sic(j)=55419901 then naics(j)=’447190′;
else if sic(j)=55419902 then naics(j)=’447190′;
else if sic(j)=55419903 then naics(j)=’447190′;
else if sic(j)=54110200 then naics(j)=’445120′;
else if sic(j)=54110201 then naics(j)=’445120′;
else if sic(j)=54110202 then naics(j)=’445120′;

else if sic(j)=59930000 then naics(j)=’453991′;
else if sic(j)=59939900 then naics(j)=’453991′;
else if sic(j)=59939901 then naics(j)=’453991′;
else if sic(j)=59939902 then naics(j)=’453991′;
else if sic(j)=59939903 then naics(j)=’453991′;
else if sic(j)=59939904 then naics(j)=’453991′;

else if sic(j)=54110000 then naics(j)=’445110′;
else if sic(j)=54110100 then naics(j)=’445110′;
else if sic(j)=54110101 then naics(j)=’445110′;
else if sic(j)=54110102 then naics(j)=’445110′;
else if sic(j)=54110103 then naics(j)=’445110′;
else if sic(j)=54110104 then naics(j)=’445110′;
else if sic(j)=54110105 then naics(j)=’445110′;
else if sic(j)=54119900 then naics(j)=’445110′;
else if sic(j)=54119901 then naics(j)=’445110′;
else if sic(j)=54119902 then naics(j)=’445110′;
else if sic(j)=54119903 then naics(j)=’445110′;
else if sic(j)=54119904 then naics(j)=’445110′;
else if sic(j)=54119905 then naics(j)=’445110′;

else if sic(j)=53999906 then naics(j)=’452311′;

else if sic(j)=59120000 then naics(j)=’446110′;
else if sic(j)=59129900 then naics(j)=’446110′;
else if sic(j)=59129901 then naics(j)=’446110′;
else if sic(j)=59129902 then naics(j)=’446110′;

else if sic(j)=59210000 then naics(j)=’445310′;
else if sic(j)=59210100 then naics(j)=’445310′;
else if sic(j)=59210101 then naics(j)=’445310′;
else if sic(j)=59210102 then naics(j)=’445310′;
else if sic(j)=59219900 then naics(j)=’445310′;
else if sic(j)=59219901 then naics(j)=’445310′;

else if sic(j)=53119901 then naics(j)=’452210′;
else if sic(j)=53310000 then naics(j)=’452319′;
else if sic(j)=53990000 then naics(j)=’452319′;
else if sic(j)=53999900 then naics(j)=’452319′;
else if sic(j)=53999901 then naics(j)=’452319′;
else if sic(j)=53999902 then naics(j)=’452319′;
else if sic(j)=53999903 then naics(j)=’452319′;
else if sic(j)=53999904 then naics(j)=’452319′;
else if sic(j)=53999905 then naics(j)=’452319′;
else if sic(j)=55319900 then naics(j)=’452319′;
end;
drop j ;
run;

*****************************************************************************************************
Add store type indicator into the dataset
*****************************************************************************************************;

data datasets.cleaned_0017;
set temp_0017;

** should assign sic and naics code to missing if the stores are not met in our inclusion criteria;

array include (*) include_00 – include_17;
array naics2 (*) naics00-naics17;
array sicvar2 (*) sic00-sic17;

do j=1 to dim(include);
if include(j) ^=1 then do;
naics2(j)=””;
sicvar2(j)=.;
end;
end;

length store_type90 – store_type99 store_type00- store_type17 $15.;

array sic(*) sic90-sic99 sic00-sic17;
array naics(*) naics90 -naics99 naics00-naics17;
array type(*) store_type90 -store_type99 store_type00-store_type17;

do i=1 to dim(naics);
If naics(i) in (“447110”, “445120”, “447190”) then type(i)=”Gas/convenience”;
else If naics(i) = “453991” then type(i)=”Tobacco”;
else If naics(i) = “445110” then type(i)=”Grocery”;
else If naics(i) = “452311” then type(i)=”Supercenter”;
else If naics(i) = “446110” then type(i)=”Pharmacy”;
else If naics(i) = “445310” then type(i)=”Alcohol”;
else If naics(i) in (“452210”, “452319”) then type(i)=”Discount/general”;
Else if sic(i) ^=”” then type(i)=”Tobacco”;
end;

drop i j;

run;