In [1]:
import pandas as pd import numpy as np
In [23]:
# Column names for the header-less, space-delimited CD4 data file.
cols = ['time', 'cd4', 'age', 'packs', 'drugs', 'sex', 'cesd', 'id']

# Location of the raw data set.
CD4_URL = 'https://spark-public.s3.amazonaws.com/dataanalysis/cd4.data'

# The file has no header row, so the names above are supplied explicitly;
# blanks that follow a delimiter are skipped.
cd4Data = pd.read_csv(
    CD4_URL,
    delimiter=' ',
    skipinitialspace=True,
    header=None,
    names=cols,
)
In [34]:
# Sort the observations by time. `Series.order()` and the `.ix` indexer have
# been removed from pandas; `DataFrame.sort_values` is the supported way to
# reorder the rows by a column (the original row labels travel with the rows).
cd4Data = cd4Data.sort_values('time')

# Shift every row label up by one.
# NOTE: this is not idempotent -- re-running the cell shifts the labels again.
cd4Data.index = cd4Data.index + 1

cd4Data.head()
Out[34]:
time cd4 age packs drugs sex cesd id
1279 -2.989733 814 6.17 3 1 5 -3 30183
2190 -2.989733 400 -6.02 0 0 3 -4 41406
1167 -2.984257 467 13.94 0 1 1 0 30046
1427 -2.956879 749 -4.54 0 1 -1 -7 30498
2032 -2.951403 1218 5.57 3 1 5 3 41032
In [111]:
# Raw observations as small black dots.
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 2 before until 2 after (window size = 5)
# `pd.rolling_mean` has been removed from pandas; the `.rolling()` accessor
# computes the same centred moving average.
plot(cd4Data['time'], cd4Data['cd4'].rolling(5, center=True).mean(), linewidth=2);
In [112]:
# Raw observations as small black dots.
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 10 before until 10 after (window size = 21)
# `pd.rolling_mean` has been removed from pandas; the `.rolling()` accessor
# computes the same centred moving average.
plot(cd4Data['time'], cd4Data['cd4'].rolling(21, center=True).mean(), linewidth=2);
In [113]:
# Raw observations as small black dots.
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# average from 200 before until 200 after (window size = 401)
# `pd.rolling_mean` has been removed from pandas; the `.rolling()` accessor
# computes the same centred moving average.
plot(cd4Data['time'], cd4Data['cd4'].rolling(401, center=True).mean(), linewidth=3);
In [114]:
def tukey(x):
    """Evaluate the kernel max(1 - x**2, 0)**2.

    The value is 1 at x = 0 and exactly 0 wherever |x| >= 1.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which to evaluate the kernel.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Kernel value(s), with the same shape as `x`.
    """
    u = np.asarray(x, dtype=float)
    # np.maximum clips element-wise, so whole arrays are accepted as well as
    # scalars (the built-in max() only handles a single value).
    return np.maximum(1 - u ** 2, 0) ** 2
In [117]:
ws = 10.
# this is just for plotting, not really useful in the moving average later
# Build the weights as an ndarray: on Python 3 a bare `map(...)` is a lazy
# iterator that can be neither divided by a number nor plotted.
filt = np.array([tukey(u) for u in np.arange(-ws, ws + 1) / (ws + 1)])
# Normalise so that the weights sum to 1.
filt = filt / filt.sum()
plot(np.arange(-ws, ws + 1), filt, 'ok');
In [118]:
# here we'd like to define a 'reducer' function for the rolling apply
# that is, a function that returns a scalar given a vector
def filt(x):
    """Return the `tukey`-weighted average of one rolling window.

    Parameters
    ----------
    x : array-like
        All values of the current window, in order.

    Returns
    -------
    numpy.float64
        sum(w * x), where the weights w come from `tukey` evaluated on a
        symmetric grid over the window and are normalised to sum to 1.
        NaN is returned for an empty window.
    """
    values = np.asarray(x, dtype=float)
    n = values.size
    if n == 0:
        # No observations, hence no meaningful average.
        return np.nan
    # determining ws is a bit tricky,
    # because x should contain the whole window
    ws = (n - 1) / 2.
    # arange(n) - ws always yields exactly n symmetric offsets (-ws ... ws),
    # so the weights are guaranteed to line up with the window values.
    offsets = np.arange(n) - ws
    # Materialise the weights as an ndarray: on Python 3 a bare `map(...)` is
    # a lazy iterator that supports neither division nor multiplication.
    f = np.array([tukey(u) for u in offsets / (ws + 1)])
    f = f / f.sum()
    return np.sum(f * values)
In [119]:
# ws=100 means 100 before until 100 after --> window size of 201
# The window length must be an integer number of observations.
window = 201
plot(cd4Data['time'], cd4Data['cd4'], 'ok', markersize=1)
# weighted moving average with the filt function
# `pd.rolling_apply` has been removed from pandas; `.rolling(...).apply` is the
# replacement. raw=True hands each window to `filt` as a plain ndarray.
plot(cd4Data['time'], cd4Data['cd4'].rolling(window, center=True).apply(filt, raw=True), linewidth=3);
In [152]:
# no loess implementation in pandas/statsmodels, and lowess implementation is somehow broken for this particular data from statsmodels.nonparametric.smoothers_lowess import lowess #lowess_line = lowess(cd4Data['cd4'], cd4Data['time'])
In [154]:
# the splines support is pretty basic also; R's regression with splines is just übercool
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4