from pyspark import SparkContext

# Reuse an existing SparkContext if one is already defined (e.g. in a shell);
# the original assignment was missing its indentation under the if.
if 'sc' not in locals():
    sc = SparkContext("local")

# Distribute the integers 0..999 across 10 partitions and compute their mean.
rdd = sc.parallelize(range(1000), 10)
print(rdd.mean())
499.5
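For reference, the same RDD exposes other built-in aggregations; a minimal sketch against the rdd above (the st name is mine; stats() returns a single StatCounter, so the data is scanned only once):

# One pass over the RDD yields count, mean, stdev, min, and max together.
st = rdd.stats()
print(st.count(), st.mean(), st.stdev())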
# Distribute a small list and sum it with a commutative, associative reduce.
data = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89]
distData = sc.parallelize(data)
res = distData.reduce(lambda a, b: a + b)
print(res)
231
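The lambda above is plain addition, so an equivalent form passes operator.add; pyspark also ships sum() directly on the RDD:

import operator
print(distData.reduce(operator.add))  # same result: 231
print(distData.sum())                 # built-in shortcut for this reduce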
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# The snippet below needs a SparkSession; create one if it does not exist yet.
spark = SparkSession.builder.getOrCreate()

# Build a two-column string schema from the header and read the CSV file.
header = 'animal,name'
schema = StructType([StructField(colname, StringType(), True) for colname in header.split(',')])
pets = spark.read.schema(schema).csv('gs://BUCKET_NAME/unstructured/pets.txt')

# Register a temp view and count the pets of each animal type with SQL.
pets.createOrReplaceTempView('pets')
countsByPet = spark.sql('SELECT animal, COUNT(*) FROM pets GROUP BY animal')
print(countsByPet.take(10))
[Row(animal=u'Frog', count(1)=2), Row(animal=u'Pig', count(1)=3), Row(animal=u'Cat', count(1)=5), Row(animal=u'Dog', count(1)=6)]
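The same count can be written with the DataFrame API instead of a temp view; a sketch against the pets DataFrame above (countsByPetDF is an illustrative name; note the result column is called count here rather than count(1)):

# groupBy/count is the DataFrame-API equivalent of the SQL query above.
countsByPetDF = pets.groupBy('animal').count()
print(countsByPetDF.take(10))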
# The same grouping with the RDD API: map each line to an (animal, [name])
# pair, then concatenate the name lists per animal. The RDD is bound to
# lines rather than file, which shadows a Python 2 builtin.
lines = sc.textFile("gs://BUCKET_NAME/unstructured/pets.txt")
pets = lines.map(lambda s: s.split(",")).map(lambda x: (x[0], [x[1]]))
petsByType = pets.reduceByKey(lambda a, b: a + b)
print(petsByType.take(10))
[(u'Frog', [u'Kermit', u'Hoppy']), (u'Pig', [u'Bacon', u'Babe', u'Tasty']), (u'Dog', [u'Noir', u'Bree', u'Pickles', u'Sparky', u'Gigi', u'Fred']), (u'Cat', [u'Tom', u'Alley', u'Cleo', u'George', u'Suzy'])]
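An alternative sketch: since the goal here is just to collect names per animal, groupByKey expresses the grouping directly (pairs and petsGrouped are illustrative names; grouped values come back as an iterable, hence the mapValues(list)):

# Map to plain (animal, name) pairs, then group all names under each animal.
pairs = lines.map(lambda s: s.split(",")).map(lambda x: (x[0], x[1]))
petsGrouped = pairs.groupByKey().mapValues(list)
print(petsGrouped.take(10))

reduceByKey is usually preferred for true reductions because it combines within partitions before shuffling; for pure grouping like this, groupByKey avoids building intermediate per-partition lists.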