In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py

from moztelemetry import get_pings, get_pings_properties
import datetime
In [2]:
# Display the default parallelism reported by `sc`.
# NOTE(review): `sc` is not defined in any visible cell — presumably the
# SparkContext injected by the notebook environment; confirm.
sc.defaultParallelism
Out[2]:
16
In [5]:
# Fetch Firefox nightly "main" pings (v4 schema) submitted on 2015-07-16.
# NOTE(review): fraction=0.1 presumably samples ~10% of matching pings — confirm
# against the moztelemetry get_pings documentation.
pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    submission_date="20150716",
    schema="v4",
    doc_type="main",
    fraction=0.1,
)
In [6]:
# Keep only the "creationDate" property of each ping and cache the result,
# since it is consumed by more than one later cell (count and age histogram).
pingDates = (
    get_pings_properties(pings, ["creationDate"])
    .cache()
)
In [25]:
# Number of pings in pingDates; used later as the denominator when
# computing the cumulative fraction per age bucket.
total = pingDates.count()
total
Out[25]:
21742
In [49]:
# Reference day against which each ping's creation date is compared
# (matches the submission_date used when fetching the pings).
targetDate = datetime.date(2015, 7, 16)


def get_age(ping):
    """Return an ``(age_in_days, 1)`` pair for one ping.

    The age is the number of whole days between the ping's creation date
    (the first 10 characters of ``ping["creationDate"]``, parsed as
    ``YYYY-MM-DD``) and the module-level ``targetDate``. Creation dates
    after ``targetDate`` are clamped to an age of 0. The constant ``1`` is
    a count so the pairs can be summed per age with a reduce-by-key.
    """
    created_day = ping["creationDate"][:10]
    created = datetime.datetime.strptime(created_day, "%Y-%m-%d").date()
    age_days = (targetDate - created).days
    if age_days < 0:
        # Ping claims to be created after the reference day: treat as age 0.
        age_days = 0
    return (age_days, 1)
In [55]:
from operator import add

# Sum the per-ping (age, 1) pairs into an {age_in_days: ping_count} mapping,
# then order the (age, count) pairs by age.
# sorted() is used instead of calling .sort() on the result of .items():
# on Python 3 .items() returns a view object that has no .sort() method,
# whereas sorted() yields the same sorted list on both Python 2 and 3.
delays = sorted(
    pingDates.map(get_age).reduceByKeyLocally(add).items(),
    key=lambda pair: pair[0],
)
In [58]:
# Walk the age buckets in ascending order, keeping a running ping count.
# Each row is (days, count, running count, running count / total).
cumulative_list = []
cumulative = 0
for days, count in delays:
    cumulative = cumulative + count
    row = (days, count, cumulative, cumulative / float(total))
    cumulative_list.append(row)
cumulative_list
Out[58]:
[(0, 15093, 15093, 0.6941863673995032),
 (1, 4487, 19580, 0.900561125931377),
 (2, 741, 20321, 0.9346426271732131),
 (3, 397, 20718, 0.9529022169073682),
 (4, 212, 20930, 0.9626529298132647),
 (5, 116, 21046, 0.9679882255542268),
 (6, 109, 21155, 0.9730015637935793),
 (7, 82, 21237, 0.9767730659552939),
 (8, 74, 21311, 0.9801766166865974),
 (9, 116, 21427, 0.9855119124275595),
 (10, 74, 21501, 0.988915463158863),
 (11, 72, 21573, 0.9922270260325637),
 (12, 60, 21633, 0.9949866617606475),
 (13, 63, 21696, 0.9978842792751357),
 (14, 26, 21722, 0.9990801214239721),
 (16, 2, 21724, 0.9991721092815748),
 (18, 2, 21726, 0.9992640971391776),
 (20, 1, 21727, 0.9993100910679791),
 (21, 1, 21728, 0.9993560849967804),
 (24, 1, 21729, 0.9994020789255819),
 (27, 1, 21730, 0.9994480728543832),
 (31, 1, 21731, 0.9994940667831846),
 (37, 1, 21732, 0.999540060711986),
 (52, 1, 21733, 0.9995860546407874),
 (79, 5, 21738, 0.9998160242847944),
 (80, 4, 21742, 1.0)]
In [61]:
# Tabulate the cumulative rows, one row per age bucket.
df = pd.DataFrame(
    cumulative_list,
    columns=("days", "count", "cumulative_count", "cumulative_pct"),
)
df
Out[61]:
days count cumulative_count cumulative_pct
0 0 15093 15093 0.694186
1 1 4487 19580 0.900561
2 2 741 20321 0.934643
3 3 397 20718 0.952902
4 4 212 20930 0.962653
5 5 116 21046 0.967988
6 6 109 21155 0.973002
7 7 82 21237 0.976773
8 8 74 21311 0.980177
9 9 116 21427 0.985512
10 10 74 21501 0.988915
11 11 72 21573 0.992227
12 12 60 21633 0.994987
13 13 63 21696 0.997884
14 14 26 21722 0.999080
15 16 2 21724 0.999172
16 18 2 21726 0.999264
17 20 1 21727 0.999310
18 21 1 21728 0.999356
19 24 1 21729 0.999402
20 27 1 21730 0.999448
21 31 1 21731 0.999494
22 37 1 21732 0.999540
23 52 1 21733 0.999586
24 79 5 21738 0.999816
25 80 4 21742 1.000000
In [65]:
%pylab inline
plt.style.use('ggplot')
df.plot(y="cumulative_pct", x="days")
plt.title="Age of data in days"
Populating the interactive namespace from numpy and matplotlib