Thursday, February 27, 2014

Avoiding XPath : Part V

(under construction; updated 4/16/2014)

Working with such large files in R has proved to be a real challenge for 4 GB RAM.  I am studying various methods to overcome some of the challeges, but also am considering just working in Postgres or RPostgresSQL.


Once EVTX logs are in CSV format, R has the potential to parse them effectively, although I believe I am one tricky use of 'reshape'  from  doing this in the script below.



#R 3.0 Code
library(data.table) # 'data.table'
library(reshape2)
# This 'flattens' the message field, pulling out returns and tabs:
setwd("C:/Windows/System32/winevt/Logs/")
# read.csv("Archive-Security-2012-02-05-23-16-29-596.evtx.csv", as.is=TRUE)
d <- read.csv("Archive-Security-2012-02-05-23-16-29-596.evtx.csv", as.is=TRUE)
h <- gsub('\n\t',' ',d$Message,fixed=TRUE)
h <- gsub('\n\n',' ',h,fixed=TRUE)
h <- gsub('\t\t',' ',h,fixed=TRUE)
h <- gsub('\n',' ',h,fixed=TRUE)
h <- gsub('\t',' ',h,fixed=TRUE)
h <- gsub('%%','',h,fixed=TRUE)
d$Message <- h

# This uses data.table and strsplit to produce a matrix of Message fields, although it doesn't do so completely:
setwd("C:/ps1/")
DF <- d
rm(d)
DT <- data.table(DF)
setkey(DT,RecordId)
f1 <- ": "
E5156 <- lapply((DT[Id=="5156"])$Message,strsplit,f1)
NR <- length(E5156)
# testing with 'NR <- 20'

# Write a table of the fields I want.
# del 5156 because the following line accumulates/append to that files value
for (i in 1:NR) {write.table((E5156[1:i][[i]][[1]][c(7:10)]),"5156",quote=TRUE, sep=",",col.names=FALSE,row.names=c("Src","SrcPort","Dst","DstPort"),append=TRUE)}

#Strip the table of words I don't want:
g <- read.table("5156",sep="\n")
h <- gsub("Protocol","",g$V1)
h <- gsub("Source Port","",h)
h <- gsub("Destination Address","",h)
h <- gsub("Destination Port","",h)

# some nifty use of 'reshape' is needed below I think:

> DF1 <- data.frame(h,check.names=FALSE)
> DF1
                                h
1            Src,192.168.200.126
2                  SrcPort,58136
3              Dst,192.168.200.1
4                     DstPort,53
5            Src,192.168.200.126
6                  SrcPort,52494
7              Dst,192.168.200.1
8                     DstPort,53
9            Src,192.168.200.126
10                 SrcPort,56348
11             Dst,192.168.200.1
12                    DstPort,53
13           Src,192.168.200.126
14                  SrcPort,1855
15              Dst,65.55.119.90
16                    DstPort,80
...

updated approach

# del 5156
for (i in 1:NR) {write.table((E5156[1:i][[i]][[1]][c(7:10)]),"5156",quote=TRUE, sep=",",col.names=FALSE,row.names=c("Src","SrcPort","Dst","DstPort"),append=TRUE)}
g <- read.table("5156",sep="\n")
h <- gsub("Protocol","",g$V1)
h <- gsub("Source Port","",h)
h <- gsub("Destination Address","",h)
h <- gsub("Destination Port","",h)
#DF1 <- data.frame(strsplit(h,","),row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
#DF1 <- data.frame(h,row.names=c("V1","V2"),col.names=c("Src","SrcPort","Dst","DstPort"))
DF1 <- data.frame(h,check.names=TRUE)

# The Time vector was added to help make the reshape function work.
DF1$Time <-Time <- (as.vector(sapply(1:(nrow(DF1)/4),rep,4)))
write.csv(DF1, file="DF1.csv",quote=FALSE)
# read.csv("DF1.csv")
DF1 <- read.csv("DF1.csv",col.names=c("Category","Address/Port","Time"))

> head(DF1,12)
   Category     Address.Port Time
1       Src 192.168.200.126     1
2   SrcPort           58136     1
3       Dst   192.168.200.1     1
4   DstPort              53     1
5       Src 192.168.200.126     2
6   SrcPort           52494     2
7       Dst   192.168.200.1     2
8   DstPort              53     2
9       Src 192.168.200.126     3
10  SrcPort           56348     3
11      Dst   192.168.200.1     3

12  DstPort              53     3
....




Probably working

library(reshape2)
dcast(DF1, Address.Port ~ Category)
acast(DF1, Address.Port ~ Category)
reshape2::recast(DF1, Address.Port ~ Category)
reshape::cast(DF1,  Address.Port ~ Category)
wide <- reshape(DF1,v.names="Address.Port", timevar="Time", idvar="Category", direction="wide")


library(lattice)
library(plyr)
graph<- (subset(DF1, Category == "Src",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
                Address.Port freq
1                       ::1     3
2           192.168.200.126    30
3           239.255.255.250     3
4 fe80::6172:6ecf:2d05:b0ae    18
5                   ff02::c     6;

graph<- (subset(DF1, Category == "Dst",select = Address.Port))
barchart((as.vector(graph$Address.Port)),stack=TRUE)
count(graph)
      Address.Port freq
1             ::1     6
2       127.0.0.1     3
3   192.168.200.1     9
4     224.0.0.252    15
5 255.255.255.255     3
6    65.55.119.90     3
7       ff02::1:3    15
8         ff02::c     6

No comments:

Post a Comment