# TensorFlow 1.x-style graph/session code; under TensorFlow 2.x it can be run
# via `import tensorflow.compat.v1 as tf` plus `tf.disable_v2_behavior()`.
import tensorflow as tf
import numpy as np

# findspark locates the local Spark installation before pyspark is imported.
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Create the Spark contexts only once, so this block can be re-run safely.
if 'sc' not in locals():
    sc = SparkContext()
if 'sqlContext' not in locals():
    sqlContext = SQLContext(sc)
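# Under Spark 2.x+, SparkSession supersedes SQLContext as the entry point; a
# minimal sketch (getOrCreate() reuses the SparkContext created above):
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()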
# Build a one-column DataFrame of words and count occurrences of each.
wordsDF = sqlContext.createDataFrame(
    [('cat',), ('elephant',), ('rat',), ('rat',), ('cat',)], ['word'])
wordCountsDF = wordsDF.groupBy("word").count()
wordCountsDF.show()
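# The same count expressed at the RDD level, as a minimal sketch of the
# classic map/reduceByKey formulation (variable names here are illustrative):
wordPairs = sc.parallelize(['cat', 'elephant', 'rat', 'rat', 'cat']).map(lambda w: (w, 1))
wordCounts = wordPairs.reduceByKey(lambda a, b: a + b)
print(wordCounts.collect())  # e.g. [('rat', 2), ('elephant', 1), ('cat', 2)]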
# Synthetic data: 100 random points on the line y = 0.1 * x + 0.3.
x_data = np.random.rand(100).astype(np.float32)
y_data = x_data * 0.1 + 0.3

# Trainable parameters; gradient descent should recover W ~ 0.1 and b ~ 0.3.
W = tf.Variable(tf.random_uniform([1], -1.0, 1.0))
b = tf.Variable(tf.zeros([1]))
y = W * x_data + b

# Mean-squared-error loss minimized with plain gradient descent (lr = 0.5).
loss = tf.reduce_mean(tf.square(y - y_data))
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)
# initialize_all_variables() is deprecated; global_variables_initializer()
# is its replacement in TensorFlow 1.x.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for step in range(201):
    sess.run(train)
    if step % 20 == 0:
        print(step, sess.run(W), sess.run(b))
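# As a sanity check (a sketch, not part of the original run): NumPy's
# closed-form least-squares fit should agree with the trained W and b.
slope, intercept = np.polyfit(x_data, y_data, 1)
print('closed form:', slope, intercept)  # approximately 0.1 and 0.3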
Sample output:

+--------+-----+
|    word|count|
+--------+-----+
|     rat|    2|
|     cat|    2|
|elephant|    1|
+--------+-----+

(0, array([ 0.69630218], dtype=float32), array([ 0.00331201], dtype=float32))
(20, array([ 0.28020507], dtype=float32), array([ 0.21137346], dtype=float32))
(40, array([ 0.15433531], dtype=float32), array([ 0.2732774], dtype=float32))
(60, array([ 0.11638315], dtype=float32), array([ 0.29194263], dtype=float32))
(80, array([ 0.10493983], dtype=float32), array([ 0.29757056], dtype=float32))
(100, array([ 0.10148945], dtype=float32), array([ 0.2992675], dtype=float32))
(120, array([ 0.10044909], dtype=float32), array([ 0.29977915], dtype=float32))
(140, array([ 0.10013542], dtype=float32), array([ 0.2999334], dtype=float32))
(160, array([ 0.10004086], dtype=float32), array([ 0.29997993], dtype=float32))
(180, array([ 0.10001232], dtype=float32), array([ 0.29999396], dtype=float32))
(200, array([ 0.10000371], dtype=float32), array([ 0.29999819], dtype=float32))