You've used the trends package to reveal a few simple properties of trends in random sequences. Let's explore Trends and Trendlists a little more.
You should be able to find the longest trend in a random sequence easily enough: decompose the sequence into trends with trends.decompose(), then pick out the longest with Python's max().
How do you find the length of a Trend object? Easy: with its length attribute, Trend.length.
Here's that code.
import math
from random import random

import trends

def mean_longest_trend(nrands, trials):
    # mean length of the longest trend, over many random sequences
    total_max_lengths = 0
    for _ in range(trials):
        seq = [random() for _ in range(nrands)]
        trnds = trends.decompose(seq)[0]           # decompose into trends
        lengths = [trnd.length for trnd in trnds]  # every trend's length
        total_max_lengths += max(lengths)          # tally the longest
    return total_max_lengths // trials
Try it out a few times.
mean_longest_trend(100, 100)
Look to see how it varies with the length of the sequence.
def longest_by_sequence_length(max_nrands, trials, points):
    # mean longest trend for a range of sequence lengths
    interval = max_nrands // points
    longest = {}
    for nrands in range(1, max_nrands, interval):
        longest[nrands] = mean_longest_trend(nrands, trials)
    return longest
longests = longest_by_sequence_length(100_000, 100, 10)
longests
Seems like a nice graph would help. First, just steal code from the earlier tutorial.
from matplotlib import pyplot as plt
import numpy as np
def slope_and_intercept(x, y):
    # fit a least-squares line
    m, b = np.polyfit(x, y, deg=1)
    return (m, b)

def legend(m, b, r):
    equation = f"y = {m:2.2f}*x + {b:2.2f}"
    rho_sq = "\u03c1**2"  # Unicode character for rho is U+03C1. Sadly, the font lacks superscripts.
    goodness_of_fit = f"{rho_sq} = {r**2:2.3f}"  # coefficient of determination = correlation**2
    legend = f"{equation}\n{goodness_of_fit}"
    return legend
def annotate_graph(x, y):
# add in a best-fit line and a legend
m, b = slope_and_intercept(x, y)
r = np.corrcoef(x, y)[0,1]
# create nparray that spans the x-space
xseq = np.linspace(0, math.ceil(x[-1]), num=100)
# create a legend
graph_legend = legend(m, b, r)
# add best-fit line and legend to the plot
plt.plot(xseq, m * xseq + b, color="red", lw=1.5) # best-fit line
    plt.text(0, y[-2], graph_legend, color="red")  # legend in the upper left-hand corner
def graph(x, y, x_label, y_label, title):
# Create a scatterplot with a title and axis labels
plt.figure(figsize=(10, 10))
plt.scatter(x, y, s=10, alpha=0.7)
plt.title(title)
plt.xlabel(x_label)
plt.ylabel(y_label)
annotate_graph(x, y) # add the legend and the best-fit line
Having defined those, you can graph your data.
seq_lengths = list(longests.keys())
longest_trends = list(longests.values())
graph(seq_lengths, longest_trends,
x_label="Sequence Length",
y_label="Mean Longest Trend",
title="Longest Trend as a Function of Sequence Length")
Sweet, right?
And what does this tell you?
The longest trend typically stretches out to nearly two-thirds of the sequence. That may seem surprising, but it's a useful fact for building your intuition.
The whole sequence is just random floats between zero and one, so for a sequence of any length, the sequence mean will be about $0.5$.
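If you want to convince yourself, here's a quick sanity check (just a sketch; it reuses the random import from earlier):
import statistics

# the mean of many Uniform(0, 1) floats should be close to 0.5
statistics.mean([random() for _ in range(100_000)])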
Intuitively, since you know the longest trend is also really long -- over half the whole sequence -- you'd guess that it, too, should have that average. Does it?
It's easy to tweak the previous code slightly to report both the length and the average of the longest trend.
Objects in the class trends.Trend have two attributes: length and average. Here's how to use both, first to collect and then to summarize data.
def longest_trend(nrands):
# return the longest trend in a random sequence
seq = [random() for _ in range(nrands)]
trnds = trends.decompose(seq)[0] # decompose into trends
lengths = [trnd.length for trnd in trnds] # list all trend lengths
longest = lengths.index(max(lengths)) # find which one's biggest
return trnds[longest] # and return that trend, with both attributes.
longest_trend(1000)
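As an aside, max() can also do the bookkeeping itself if you hand it a key function. This variant is just a sketch, relying only on the Trend.length attribute you've already seen:
def longest_trend_by_key(nrands):
    # same result as longest_trend(), but let max() compare lengths directly
    seq = [random() for _ in range(nrands)]
    trnds = trends.decompose(seq)[0]
    return max(trnds, key=lambda trnd: trnd.length)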
Now use that to describe the typical longest trend.
def typical_longest_trend(nrands, trials):
sum_averages = 0
sum_lengths = 0
for _ in range(trials):
longest = longest_trend(nrands)
sum_averages += longest.average
sum_lengths += longest.length
return trends.Trend(round(sum_averages/trials, 3), length=round(sum_lengths/trials))
typical_longest_trend(10_000, 10)
You know the trend means decrease monotonically, so the first trend has the biggest mean, and the last one, the smallest. How big and how small are they? Use the same approach you used in the last section.
def trend_at(position, nrands):
# return the trend at a particular position in a random sequence
seq = [random() for _ in range(nrands)]
trnds = trends.decompose(seq)[0] # decompose into trends
return trnds[position] # and return that trend, with both attributes.
A position parameter means you won't have to write a separate function for first and last.
FIRST = 0
LAST = -1
trend_at(FIRST, 10_000) # the first trend
trend_at(LAST, 10_000) # the last trend
def typical_trend_at(position, nrands, trials):
sum_averages = 0
sum_lengths = 0
for _ in range(trials):
trnd = trend_at(position, nrands)
sum_averages += trnd.average
sum_lengths += trnd.length
return trends.Trend(round(sum_averages/trials, 3), length=round(sum_lengths/trials))
typical_trend_at(FIRST, 10_000, 100)
typical_trend_at(LAST, 10_000, 100)
Now watch how it varies with sequence length:
def typical_trend_by_seq_length(position, max_nrands, trials, points):
    # typical trend at a given position, for a range of sequence lengths
    interval = max_nrands // points
    typicals = {}
    for nrands in range(1, max_nrands, interval):
        typicals[nrands] = typical_trend_at(position, nrands, trials)
    return typicals
Here's the typical first trend:
typical_trend_by_seq_length(FIRST, 100_000, 100, 10)
And the typical final trend:
typical_trend_by_seq_length(LAST, 100_000, 100, 10)
Lengths vary a lot, but the means are pretty stable: the first trend's mean is about 2/3, and the last is about 1/3.
Even short random sequences show these properties.
typical_trend_by_seq_length(0, 20, 1000, 20)
typical_trend_by_seq_length(-1, 20, 1000, 20)
Sequences with as few as 10 random floats have first and last trends with means of around 2/3 and 1/3.
For perspective, take a look at the longest trend in short sequences.
def typical_longest_trend_by_seq_length(max_nrands, trials, points):
    # typical longest trend for a range of sequence lengths
    interval = max_nrands // points
    typicals = {}
    for nrands in range(1, max_nrands, interval):
        typicals[nrands] = typical_longest_trend(nrands, trials)
    return typicals
typical_longest_trend_by_seq_length(20, 1000, 20)
With this perspective, let's look at all the Trends in a Trendlist.
There are only about $\ln N$ Trends in a sequence of $N$ floats -- for a million floats, that's about $\ln(10^6) \approx 14$ -- so you can take a quick peek at even a very long sequence. Remember, decomposing a sequence of a million floats only takes about 4 seconds!
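If you'd like to verify that timing on your own machine, here's a minimal sketch using time.perf_counter() (the 4 seconds will vary with hardware):
import time

seq = [random() for _ in range(1_000_000)]
start = time.perf_counter()
trends.decompose(seq)
print(f"decomposed 1,000,000 floats in {time.perf_counter() - start:.1f} seconds")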
By running the cell below, repeatedly, you can see a lot of variation. (Each run is only a single trial.)
seq = [random() for _ in range(1_000_000)]
trendlist = trends.decompose(seq)[0]
trendlist
The middle trends tend to be long, with means near 1/2; the lengths shorten sharply near each end, usually accompanied by big changes in the means.
So far, you've summarized this by looking at a "typical" trend from each end and a "typical" longest trend.
Another kind of summary is the conventional mean/variance pair.
The trends package supplies a pair of methods, Trendlist.lengths() and Trendlist.averages(), that make this easy. You will also want mean() and variance() from Python's statistics module.
import statistics
import collections

import trends

def basic_stats(lst):
    # bundle a list's mean and variance into a named tuple
    Stats = collections.namedtuple("Stats", "mean variance")
    return Stats(mean=statistics.mean(lst), variance=statistics.variance(lst))

def trendlist_stats(nrands):
    # mean/variance of the trend lengths and averages in one random sequence
    seq = [random() for _ in range(nrands)]
    trendlist = trends.decompose(seq)[0]
    lengths = basic_stats(trendlist.lengths())
    aves = basic_stats(trendlist.averages())
    return (lengths, aves)
trendlist_stats(100_000)
You know the average number of hiccups in a random sequence will be about $\ln N$. How much will that vary? You'll want a couple of functions from Python's standard statistics module for that: mean() and variance().
from statistics import mean, variance
import collections
def basic_stats(lst):
    # bundle a list's mean and variance into a named tuple
    Stats = collections.namedtuple("Stats", "mean variance")
    return Stats(mean=mean(lst), variance=variance(lst))
def trendlist_hic_stats(nrands, trials):
trendlist_hiccups = []
for _ in range(trials):
seq = [random() for _ in range(nrands)]
trendlist_hiccups.append(len(trends.decompose(seq)[0])-1)
return basic_stats(trendlist_hiccups)
trendlist_hic_stats(100_000, 1000)
def trend_count(nrands, trials):
    # the number of trends in each of `trials` random sequences of nrands floats
    return [len(trends.decompose([random() for _ in range(nrands)])[0])
            for _ in range(trials)]
trend_count(1000, 10)
plt.figure(figsize=(10, 10))
plt.hist(trend_count(1000, 1000))
plt.title("How Many Trends?")
plt.xlabel("Number of Trends")
plt.ylabel("Count")
Not bad, but looks log-normal-ish. Try a second time, using the logs of the counts.
log_trend_counts = np.log(trend_count(1_000, 10_000))
plt.hist(log_trend_counts)
plt.title("How Many Trends?")
plt.xlabel("Ln(Number of Trends)")
plt.ylabel("Count")
Now run scipy's normality test on the logged counts:
import scipy.stats

# D'Agostino-Pearson normality test
stat, p = scipy.stats.normaltest(log_trend_counts)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
print('Sample looks Gaussian (fail to reject H0)')
else:
print('Sample does not look Gaussian (reject H0)')
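Finally, back that up visually with a Q-Q plot: if the logged hiccup counts hug the fitted line, log-normal looks like a reasonable description.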
from statsmodels.graphics.gofplots import qqplot
hiccups = np.array(trend_count(10_000, 10_000)) - 1
qqplot(np.log(hiccups), line='s')
plt.show()