Working with such large files in R has proved to be a real challenge for 4 GB RAM. I am studying various methods to overcome some of the challeges, but also am considering just working in Postgres or RPostgresSQL.
Once EVTX logs are in CSV format, R has the potential to parse them effectively, although I believe I am one tricky use of 'reshape' from doing this in the script below.
#R 3.0 Code
library(data.table) # 'data.table'
library(reshape2)
# This 'flattens' the message field, pulling out returns and tabs:
setwd("C:/Windows/System32/winevt/Logs/")
# read.csv("Archive-Security-2012-02-05-23-16-29-596.evtx.csv", as.is=TRUE)
d <- read.csv("Archive-Security-2012-02-05-23-16-29-596.evtx.csv", as.is=TRUE)
h <- gsub('\n\t',' ',d$Message,fixed=TRUE)
h <- gsub('\n\n',' ',h,fixed=TRUE)
h <- gsub('\t\t',' ',h,fixed=TRUE)
h <- gsub('\n',' ',h,fixed=TRUE)
h <- gsub('\t',' ',h,fixed=TRUE)
h <- gsub('%%','',h,fixed=TRUE)
d$Message <- h
# This uses data.table and strsplit to produce a matrix of Message fields, although it doesn't do so completely:
setwd("C:/ps1/")
DF <- d
rm(d)
DT <- data.table(DF)
setkey(DT,RecordId)
f1 <- ": "
E5156 <- lapply((DT[Id=="5156"])$Message,strsplit,f1)
NR <- length(E5156)
# testing with 'NR <- 20'
# Write a table of the fields I want.
# del 5156 because the following line accumulates/append to that files value
for (i in 1:NR) {write.table((E5156[1:i][[i]][[1]][c(7:10)]),"5156",quote=TRUE, sep=",",col.names=FALSE,row.names=c("Src","SrcPort","Dst","DstPort"),append=TRUE)}
#Strip the table of words I don't want:
g <- read.table("5156",sep="\n")
h <- gsub("Protocol","",g$V1)
h <- gsub("Source Port","",h)
h <- gsub("Destination Address","",h)
h <- gsub("Destination Port","",h)
# some nifty use of 'reshape' is needed below I think:
> DF1 <- data.frame(h,check.names=FALSE)
> DF1
h
1 Src,192.168.200.126
2 SrcPort,58136
3 Dst,192.168.200.1
4 DstPort,53
5 Src,192.168.200.126
6 SrcPort,52494
7 Dst,192.168.200.1
8 DstPort,53
9 Src,192.168.200.126
10 SrcPort,56348
11 Dst,192.168.200.1
12 DstPort,53
13 Src,192.168.200.126
14 SrcPort,1855
15 Dst,65.55.119.90
16 DstPort,80
...
updated approach
# del 5156
for (i in 1:NR) {write.table((E5156[1:i][[i]][[1]][c(7:10)]),"5156",quote=TRUE, sep=",",col.names=FALSE,row.names=c("Src","SrcPort","Dst","DstPort"),append=TRUE)}
g <- read.table("5156",sep="\n")
h <- gsub("Protocol","",g$V1)
h <- gsub("Source Port","",h)
h <- gsub("Destination Address","",h)
h <- gsub("Destination Port","",h)
#DF1 <- data.frame(strsplit(h,","),row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
#DF1 <- data.frame(h,row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
DF1 <- data.frame(h,check.names=TRUE)
# The Time vector was added to help make the reshape function work.
DF1$Time <-Time <- (as.vector(sapply(1:(nrow(DF1)/4),rep,4)))
write.csv(DF1, file="DF1.csv",quote=FALSE)
# read.csv("DF1.csv")
DF1 <- read.csv("DF1.csv",col.names=c("Category","Address/Port","Time"))
> head(DF1,12)
Category Address.Port Time
1 Src 192.168.200.126 1
2 SrcPort 58136 1
3 Dst 192.168.200.1 1
4 DstPort 53 1
5 Src 192.168.200.126 2
6 SrcPort 52494 2
7 Dst 192.168.200.1 2
8 DstPort 53 2
9 Src 192.168.200.126 3
10 SrcPort 56348 3
11 Dst 192.168.200.1 3
12 DstPort 53 3
....
Probably working
library(reshape2)
dcast(DF1, Address.Port ~ Category)
acast(DF1, Address.Port ~ Category)
reshape2::recast(DF1, Address.Port ~ Category)
reshape::cast(DF1, Address.Port ~ Category)
wide <- reshape(DF1,v.names="Address.Port", timevar="Time", idvar="Category", direction="wide")
library(lattice)
library(plyr)
graph<- (subset(DF1, Category == "Src",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
Address.Port freq
1 ::1 3
2 192.168.200.126 30
3 239.255.255.250 3
4 fe80::6172:6ecf:2d05:b0ae 18
5 ff02::c 6;
graph<- (subset(DF1, Category == "Dst",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
Address.Port freq
1 ::1 6
2 127.0.0.1 3
3 192.168.200.1 9
4 224.0.0.252 15
5 255.255.255.255 3
6 65.55.119.90 3
7 ff02::1:3 15
8 ff02::c 6
updated approach
# del 5156
for (i in 1:NR) {write.table((E5156[1:i][[i]][[1]][c(7:10)]),"5156",quote=TRUE, sep=",",col.names=FALSE,row.names=c("Src","SrcPort","Dst","DstPort"),append=TRUE)}
g <- read.table("5156",sep="\n")
h <- gsub("Protocol","",g$V1)
h <- gsub("Source Port","",h)
h <- gsub("Destination Address","",h)
h <- gsub("Destination Port","",h)
#DF1 <- data.frame(strsplit(h,","),row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
#DF1 <- data.frame(h,row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
DF1 <- data.frame(h,check.names=TRUE)
# The Time vector was added to help make the reshape function work.
DF1$Time <-Time <- (as.vector(sapply(1:(nrow(DF1)/4),rep,4)))
write.csv(DF1, file="DF1.csv",quote=FALSE)
# read.csv("DF1.csv")
DF1 <- read.csv("DF1.csv",col.names=c("Category","Address/Port","Time"))
> head(DF1,12)
Category Address.Port Time
1 Src 192.168.200.126 1
2 SrcPort 58136 1
3 Dst 192.168.200.1 1
4 DstPort 53 1
5 Src 192.168.200.126 2
6 SrcPort 52494 2
7 Dst 192.168.200.1 2
8 DstPort 53 2
9 Src 192.168.200.126 3
10 SrcPort 56348 3
11 Dst 192.168.200.1 3
12 DstPort 53 3
....
Probably working
library(reshape2)
dcast(DF1, Address.Port ~ Category)
acast(DF1, Address.Port ~ Category)
reshape2::recast(DF1, Address.Port ~ Category)
reshape::cast(DF1, Address.Port ~ Category)
wide <- reshape(DF1,v.names="Address.Port", timevar="Time", idvar="Category", direction="wide")
library(lattice)
library(plyr)
graph<- (subset(DF1, Category == "Src",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
Address.Port freq
1 ::1 3
2 192.168.200.126 30
3 239.255.255.250 3
4 fe80::6172:6ecf:2d05:b0ae 18
5 ff02::c 6;
graph<- (subset(DF1, Category == "Dst",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
Address.Port freq
1 ::1 6
2 127.0.0.1 3
3 192.168.200.1 9
4 224.0.0.252 15
5 255.255.255.255 3
6 65.55.119.90 3
7 ff02::1:3 15
8 ff02::c 6
No comments:
Post a Comment