共计 2028 个字符,预计需要花费 6 分钟才能阅读完成。
本篇内容介绍了“proc sql 语句在数据清洗中的运用”的有关知识,在实际案例的操作过程中,不少人都会遇到这样的困境,接下来就让丸趣 TV 小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!
libname clean c:/books/clean /* 定义永久性数据库 */
* 新建一个样本数据 one;
data one;
input X Y Z;
datalines;
1 2 3
101 202 303
44 55 66
444 555 666
;
title Values of X from data set ONE where X is greater than 100
/* 有条件的选择观测值 */
proc sql;
select X
from one
where X gt 100;
quit;
*Program 8-2;
*** 检查无效的字符型的数值;
title Checking for Invalid Character Data
proc sql;
select Patno,
Gender,
DX,
AE
from clean.patients
where Gender not in (M , F ,) or
notdigit(trim(DX))and not missing(DX) or
AE not in (0 , 1 ,
quit;
* 检查无效的数值型的数值;
title Checking for out-of-range numeric values
proc sql;
select Patno,
HR,
SBP,
DBP
from clean.patients
where HR not between 40 and 100 and not missing(HR) or
SBP not between 80 and 200 and not missing(SBP) or
DBP not between 60 and 120 and not missing(DBP);
quit;
* 基于标准差利用简单的算法来检查数值;
title Data values beyond two standard deviations
proc sql;
select Patno,
SBP
from clean.patients
having SBP not between mean(SBP) – 2 * std(SBP) and
mean(SBP) + 2 * std(SBP) and
SBP is not missing;
quit;
* 检查缺失值;
options linesize=84;
title Observations with missing values
proc sql;
select *
from clean.patients
where Patno is missing or
Gender is missing or
Visit is missing or
HR is missing or
SBP is missing or
DBP is missing or
DX is missing or
AE is missing;
quit;
* 检查日期;
title Dates before June 1, 1998 or after October 15, 1999
proc sql;
select Patno,
Visit
from clean.patients
where Visit not between 01jun1998 d and 15oct1999 d and
Visit is not missing;
quit;
* 检查重复值;
title Duplicate Patient Numbers
proc sql;
select Patno,
Visit
from clean.patients
group by Patno
having count(Patno) gt 1;
quit;
* 识别对应多个观察值的变量;
title Listing of patients who do not have two visits
proc sql;
select Patno,
Visit
from clean.patients2
group by Patno
having count(Patno) ne 2;
quit;
* 检查两个文件中对应要求的序列号 ID;
data one;
input Patno X Y;
datalines;
1 69 79
2 56 .
3 66 99
5 98 87
12 13 14
;
data two;
input Patno Z;
datalines;
1 56
3 67
4 88
5 98
13 99
;
* 两个文件都不含的 ID;
title Patient numbers not in both files
proc sql;
select One.patno as ID_one,
Two.patno as ID_two
from one full join two
on One.patno eq Two.patno
where One.patno is missing or Two.patno is missing;
quit;
“proc sql 语句在数据清洗中的运用”的内容就介绍到这里了,感谢大家的阅读。如果想了解更多行业相关的知识可以关注丸趣 TV 网站,丸趣 TV 小编将为大家输出更多高质量的实用文章!