Dask 数组协调许多 NumPy 数组,这些数组以块(chunk)的形式排列在一个网格中。它们支持 NumPy API 的很大一部分。
Dask 数组使用分块算法(blocked algorithms)提供了一个并行的、大于内存的 n 维数组。简单地说:分布式 NumPy
并行:使用计算机上的所有 CPU 核心
大于内存:通过将数组分解成许多小块,按顺序操作这些块以最大限度地减少计算的内存占用,并有效地从磁盘流式传输数据,从而让您可以处理大于可用内存的数据集。
分块算法:通过执行许多较小的计算来完成一个大型计算
# 连接/创建 Dask集群
from dask.distributed import Client
client = Client(n_workers=4, threads_per_worker=1)
client
Client
|
Cluster
|
针对一个有10亿个随机数的大数组求和,分而治之
Dask 使用 chunks 参数将大数组切分成小块(这里每块 100 万个元素,共 1000 块),先对每个小块分别求和,然后将各小块的结果再汇总求和,最后得到总的结果
import dask.array as da
x = da.random.random((1_000_000_000,), chunks=(1_000_000,))
x
|
result = x.sum()
result
|
%time result.compute()
CPU times: user 2.3 s, sys: 246 ms, total: 2.55 s Wall time: 3.94 s
499998636.25802165
%%time
# numpy运行时间
import numpy as np
x = np.random.random((1_000_000_000,))
x.sum()
CPU times: user 10.4 s, sys: 5.79 s, total: 16.2 s Wall time: 16.9 s
499997790.0394625
import numpy as np
import dask.array as da
x = da.random.normal(10, 0.1, size=(20000, 20000),
chunks=(1000, 1000))
x
|
y = x.mean(axis=0)[::100]
y
|
%%time
y.compute()
CPU times: user 1.44 s, sys: 227 ms, total: 1.67 s Wall time: 3.85 s
array([ 9.99938546, 9.99971967, 9.99945469, 10.00064526, 9.99973264, 9.99962011, 9.99943414, 10.00153313, 10.00006829, 9.9997891 , 10.00022229, 10.0000093 , 10.00028218, 9.99916192, 10.00037475, 9.99818902, 9.99933171, 10.00078998, 9.99950032, 10.00068982, 10.00030546, 9.99955212, 10.00072705, 9.9992586 , 10.00186248, 10.00029279, 10.00080394, 10.00114623, 10.00060563, 9.99967053, 9.99989176, 9.99997171, 9.99912124, 10.00074451, 10.0003222 , 9.99986045, 9.99945197, 9.99923763, 9.99958515, 9.99886138, 10.00146995, 9.99947912, 9.99910516, 10.0005532 , 10.0011445 , 9.99997232, 10.00026516, 9.9996024 , 9.99981061, 10.00036216, 10.00044439, 9.99971844, 10.00013288, 9.99933069, 10.00047139, 10.0002093 , 10.0003827 , 9.99924716, 9.99981332, 10.00164159, 9.99972159, 10.00005599, 9.99859007, 10.00030065, 10.00181599, 9.99982092, 9.99857654, 9.99867371, 10.00005552, 10.00050094, 9.9995026 , 10.00028719, 9.99988604, 9.99908824, 9.99939675, 10.00063213, 10.00000387, 9.99894784, 9.99937416, 10.00073037, 9.99961319, 10.00072846, 9.99926124, 10.00029055, 10.00021993, 9.99992964, 10.00047872, 10.00038963, 10.00051796, 10.00038888, 10.00033705, 10.00108346, 9.9990602 , 9.99979862, 9.99863401, 9.99921999, 10.00007962, 9.99874498, 10.00008739, 9.99915782, 10.00043744, 9.99935347, 10.00094777, 10.00016539, 10.00066644, 10.00054307, 10.00071952, 10.00141817, 9.9998111 , 10.00078602, 9.99904868, 10.00056427, 10.00035578, 9.9998022 , 10.00067736, 10.00062118, 9.99891199, 10.00037441, 9.99926147, 10.00016703, 9.99858298, 9.9992852 , 10.00084714, 9.99990607, 9.99935402, 9.99954703, 9.99895847, 9.99956195, 10.00031746, 10.00062184, 9.99915744, 10.00031887, 9.99986441, 9.99979198, 10.0005084 , 9.99961863, 10.00007088, 10.00032256, 9.99904252, 10.000044 , 10.00048375, 10.00085924, 10.00022151, 10.00067298, 9.99904039, 10.00017115, 10.00102031, 9.99970683, 10.00035228, 10.00059024, 10.00059687, 9.99972138, 10.00017235, 9.99986814, 9.99866281, 10.00053003, 10.00019852, 10.00097625, 9.99922938, 
10.00005178, 10.00076521, 10.00000208, 10.00033751, 10.00106182, 9.99918381, 9.99870024, 9.99959843, 10.00079557, 10.00001793, 9.99921751, 10.00106094, 10.00005635, 9.99922579, 10.00010568, 10.00217666, 10.00052796, 10.00026221, 9.99970776, 10.00073637, 9.99995403, 10.00053143, 10.00058177, 9.99914715, 9.9999082 , 9.99875414, 10.00070716, 10.00093805, 10.00110325, 9.99960993, 9.99963664, 10.00086654, 9.99940247, 9.99873781, 9.9999873 , 9.99899916, 10.0004872 , 10.00001297, 9.99967598, 10.00015494, 9.99935123])
import numpy as np
import dask.array as da
x = da.random.normal(10, 0.1, size=(20000, 20000),
chunks=(20000, 20000))
x
|
y = x.mean(axis=0)[::100]
%time y.compute()
CPU times: user 681 ms, sys: 74.4 ms, total: 756 ms Wall time: 11.4 s
array([10.0001655 , 10.00167621, 9.99923351, 10.00064734, 10.00091432, 10.00047372, 10.00017035, 9.99953996, 9.99968177, 9.99857899, 10.00003809, 10.00108116, 9.99900785, 10.00027567, 9.99999848, 10.00082341, 10.00129353, 10.00030403, 10.00066804, 9.99966315, 9.99924217, 10.00144793, 10.00020324, 10.00086305, 10.00061989, 10.00084032, 9.99839684, 10.00006433, 9.99977552, 10.00069765, 9.99952445, 10.00001131, 10.00008032, 9.99987475, 9.99992465, 9.99989691, 9.99953196, 10.00119205, 9.99926572, 10.00054107, 10.00015775, 10.0002149 , 9.9994878 , 9.99994616, 9.99979669, 10.00072395, 9.99848976, 10.00013652, 9.99915904, 9.99975547, 9.99919632, 10.00161185, 9.99997359, 10.00021391, 9.99981143, 9.99902299, 9.99928706, 9.99987627, 9.99995705, 9.99941156, 9.99964318, 10.00022528, 9.99921117, 10.00056046, 9.99894001, 9.99892346, 9.99943849, 10.00010026, 9.99940094, 9.99957174, 9.99949583, 9.99995406, 10.00018958, 9.99908376, 9.99930929, 10.00043657, 9.99963325, 9.99881886, 9.99974896, 10.00013545, 9.99901375, 9.99937319, 9.99769809, 9.99949054, 10.00071288, 9.99963838, 9.99990762, 10.00008735, 9.99950625, 10.00014241, 9.99959867, 9.99946053, 9.99988741, 9.99995468, 10.00035638, 10.0019213 , 10.00053414, 9.99868919, 9.99984201, 10.00114261, 9.99869325, 9.99927222, 9.99915365, 10.0012004 , 9.99997232, 9.99980688, 9.99996744, 9.99942185, 9.99883205, 9.99902766, 10.0005657 , 10.00052677, 10.00082233, 10.00092042, 9.99858897, 9.99987929, 9.99963113, 9.99864403, 9.99984894, 10.00040831, 10.0009665 , 10.00073402, 9.99870406, 9.99914026, 10.00088796, 10.0008127 , 9.99969023, 9.99992695, 10.00003188, 10.00080844, 10.00032147, 10.00060291, 9.99956115, 10.00050113, 10.00035191, 10.00095687, 10.00015963, 10.00018535, 10.00004299, 10.00066757, 10.00005719, 10.00033769, 10.0013274 , 9.99848195, 10.00034481, 10.00017613, 9.99928225, 10.00034121, 9.99887689, 9.99962716, 9.99966392, 9.99916587, 10.00140038, 9.99869036, 10.0017865 , 9.99933645, 10.00047066, 10.0009407 , 10.00005309, 
10.00043234, 9.99996189, 10.00033439, 9.99870855, 9.999809 , 9.99930438, 9.99981149, 9.99937361, 10.00029258, 10.00067756, 10.00059611, 10.00083188, 10.00048247, 10.00042629, 9.99930732, 10.00077203, 10.00010038, 10.00076279, 9.99959813, 10.00028271, 9.99965757, 9.99966286, 9.99965236, 9.99873209, 9.99948903, 9.99983362, 9.99983121, 10.00172939, 10.00032148, 9.99946019, 10.0004037 , 10.00039504, 9.99967973, 9.99987867, 9.99998475, 10.0006764 , 9.99993125, 9.99982043, 9.99956422, 9.99969551, 9.999642 ])
%%time
# numpy运算时间
import numpy as np
x = np.random.normal(10, 0.1, size=(20000, 20000))
x.mean(axis=0)[::100]
CPU times: user 10.5 s, sys: 811 ms, total: 11.3 s Wall time: 10.9 s
array([ 9.99922827, 10.00026646, 9.99968943, 10.00016506, 9.99994516, 10.00061448, 9.99965795, 9.99985347, 9.99886232, 10.00058193, 10.00030265, 9.9998264 , 9.99979297, 9.99964179, 9.99929195, 10.00065933, 9.99967126, 9.99955685, 9.99986639, 9.99907462, 10.0005034 , 10.00054594, 9.99936214, 9.99982118, 9.99991934, 9.99913666, 9.99982351, 9.99973586, 9.99888925, 10.00027446, 9.99964081, 10.00068381, 10.00094463, 9.99963125, 9.99938888, 10.00004541, 10.00254318, 10.00053102, 9.99977105, 9.99977276, 9.99957297, 9.99988611, 10.00120295, 10.000437 , 9.99979155, 9.99967142, 10.00020331, 10.00116383, 10.00033801, 10.00021838, 10.00012446, 9.99951103, 9.99916995, 9.99902008, 10.00034422, 10.00001313, 9.99980868, 10.00037233, 10.0009992 , 9.99919649, 9.99879175, 10.00140597, 9.99973817, 10.00015065, 9.99917851, 9.99909349, 10.00018382, 10.00004312, 10.00032093, 9.99977143, 10.00038807, 9.99986658, 9.99935603, 10.00016754, 10.00025407, 9.99935126, 10.00169764, 9.99970532, 9.9998238 , 9.99998858, 10.00092713, 9.99955296, 10.00007319, 9.99958857, 10.00045426, 9.99931314, 9.99896041, 10.00026229, 9.99905469, 9.99995499, 10.00042923, 9.99993847, 10.00013055, 10.0008367 , 9.99910667, 10.00087811, 9.99941823, 9.99918678, 9.99986536, 9.99932546, 9.99918451, 9.99866245, 10.00033373, 10.0004227 , 10.00097181, 10.00074508, 9.99957039, 10.00097936, 10.00110915, 9.99863335, 9.99941846, 10.00078151, 10.00135763, 9.99974906, 10.00036343, 10.00020633, 9.99998518, 9.9998611 , 10.00020738, 9.99965312, 10.00068624, 9.99941021, 10.00055716, 10.00033612, 10.00070974, 9.99979137, 10.00037914, 10.00017834, 9.99915816, 10.00061095, 10.00051719, 10.00002044, 10.00025175, 10.00084379, 10.00119243, 10.00138352, 9.99981327, 9.99877617, 10.00079836, 10.00060858, 10.00022235, 10.00037564, 9.99949906, 9.99914096, 10.0002461 , 10.00060548, 9.99881911, 10.00186338, 9.99972676, 9.99978963, 10.00026371, 10.00029151, 9.99889633, 10.00062594, 9.99943762, 9.99966152, 10.00025012, 9.9994344 , 9.99999353, 
9.99877645, 10.0016442 , 10.00088683, 10.00078212, 10.00028935, 10.00051296, 9.99993713, 9.99886598, 10.00046 , 10.00026318, 10.00036496, 10.00042037, 9.99959924, 10.00037002, 10.00034273, 10.00163502, 9.99976336, 10.00000983, 9.9998691 , 10.00005712, 10.00002127, 10.00002954, 10.0004849 , 9.99969409, 9.99944515, 9.99931209, 9.99900568, 10.00068384, 10.00030046, 10.00024002, 9.99945741, 10.0001018 , 10.00002712, 9.99980739, 10.00022972, 10.00042609, 10.00061149, 9.9993555 , 9.99917595, 9.99991866, 9.99871064])
将计算结果持久化到内存中,加速后续计算
import dask.array as da
x = da.random.random((10000, 10000), chunks=(1000, 1000))
x
|
y = x + x.T
z = y[::2, 5000:].mean(axis=1)
z
|
%time z.compute()
CPU times: user 278 ms, sys: 23.3 ms, total: 302 ms Wall time: 514 ms
array([1.01400246, 0.99859011, 1.00048605, ..., 1.00554574, 1.00343839, 1.00203487])
# 持久化
y = y.persist()
%time y[0, 0].compute()
CPU times: user 12.1 ms, sys: 2.99 ms, total: 15.1 ms Wall time: 14.1 ms
0.0832880607724471
%time y[0, 0].compute()
CPU times: user 7.52 ms, sys: 1.94 ms, total: 9.47 ms Wall time: 10.1 ms
0.0832880607724471
# 清除内存占用
client.cancel(y)
# or del y
client.shutdown()