from pyspark import SparkContext

# Reuse an existing SparkContext if one is already defined (e.g. in a shell);
# the original assignment was missing its indentation under the if.
if 'sc' not in locals():
    sc = SparkContext("local")

# Distribute the integers 0..999 across 10 partitions and compute their mean.
rdd = sc.parallelize(range(1000), 10)
print(rdd.mean())
499.5
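For reference, the same RDD exposes other built-in aggregations; a minimal sketch against the rdd above (the st name is mine; stats() returns a single StatCounter, so the data is scanned only once):

# One pass over the RDD yields count, mean, stdev, min, and max together.
st = rdd.stats()
print(st.count(), st.mean(), st.stdev())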
# Distribute a small list and sum it with a commutative, associative reduce.
data = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89]
distData = sc.parallelize(data)
res = distData.reduce(lambda a, b: a + b)
print(res)
231
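The lambda above is plain addition, so an equivalent form passes operator.add; pyspark also ships sum() directly on the RDD:

import operator
print(distData.reduce(operator.add))  # same result: 231
print(distData.sum())                 # built-in shortcut for this reduce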
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# The snippet below needs a SparkSession; create one if it does not exist yet.
spark = SparkSession.builder.getOrCreate()

# Build a two-column string schema from the header and read the CSV file.
header = 'animal,name'
schema = StructType([StructField(colname, StringType(), True) for colname in header.split(',')])
pets = spark.read.schema(schema).csv('gs://BUCKET_NAME/unstructured/pets.txt')

# Register a temp view and count the pets of each animal type with SQL.
pets.createOrReplaceTempView('pets')
countsByPet = spark.sql('SELECT animal, COUNT(*) FROM pets GROUP BY animal')
print(countsByPet.take(10))
[Row(animal=u'Frog', count(1)=2), Row(animal=u'Pig', count(1)=3), Row(animal=u'Cat', count(1)=5), Row(animal=u'Dog', count(1)=6)]
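The same count can be written with the DataFrame API instead of a temp view; a sketch against the pets DataFrame above (countsByPetDF is an illustrative name; note the result column is called count here rather than count(1)):

# groupBy/count is the DataFrame-API equivalent of the SQL query above.
countsByPetDF = pets.groupBy('animal').count()
print(countsByPetDF.take(10))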
# The same grouping with the RDD API: map each line to an (animal, [name])
# pair, then concatenate the name lists per animal. The RDD is bound to
# lines rather than file, which shadows a Python 2 builtin.
lines = sc.textFile("gs://BUCKET_NAME/unstructured/pets.txt")
pets = lines.map(lambda s: s.split(",")).map(lambda x: (x[0], [x[1]]))
petsByType = pets.reduceByKey(lambda a, b: a + b)
print(petsByType.take(10))
[(u'Frog', [u'Kermit', u'Hoppy']), (u'Pig', [u'Bacon', u'Babe', u'Tasty']), (u'Dog', [u'Noir', u'Bree', u'Pickles', u'Sparky', u'Gigi', u'Fred']), (u'Cat', [u'Tom', u'Alley', u'Cleo', u'George', u'Suzy'])]
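An alternative sketch: since the goal here is just to collect names per animal, groupByKey expresses the grouping directly (pairs and petsGrouped are illustrative names; grouped values come back as an iterable, hence the mapValues(list)):

# Map to plain (animal, name) pairs, then group all names under each animal.
pairs = lines.map(lambda s: s.split(",")).map(lambda x: (x[0], x[1]))
petsGrouped = pairs.groupByKey().mapValues(list)
print(petsGrouped.take(10))

reduceByKey is usually preferred for true reductions because it combines within partitions before shuffling; for pure grouping like this, groupByKey avoids building intermediate per-partition lists.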