Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Gal Matijevic
desymlworkshop
Commits
751ceb4e
Commit
751ceb4e
authored
Nov 12, 2020
by
mtjvc
Browse files
notebook
parents
Changes
19
Expand all
Hide whitespace changes
Inline
Side-by-side
1_unsupervised/.ipynb_checkpoints/unsupervised.1-checkpoint.ipynb
0 → 100644
View file @
751ceb4e
%% Cell type:markdown id: tags:
# 1. Unsupervised learning - Part 1
%% Cell type:markdown id: tags:
### Literature / Sources
#### Papers
*
[
Visualizing High-Dimensional Data Using t-SNE, van der Maaten & Hinton, 2008
](
https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf
)
*
[
Accelerating t-SNE using Tree-Based Algorithms, van der Maaten, 2014
](
https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf
)
#### Links
*
[
RAVE Survey
](
https://www.rave-survey.org/
)
*
[
DmitryUlyanov / Multicore-TSNE
](
https://github.com/DmitryUlyanov/Multicore-TSNE
)
*
[
scikit-learn
](
https://scikit-learn.org/stable/
)
*
[
hdbscan
](
https://hdbscan.readthedocs.io/en/latest/index.html
)
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
```
python
import
os
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
pandas
as
pd
from
sklearn.decomposition
import
PCA
from
sklearn.preprocessing
import
StandardScaler
from
MulticoreTSNE
import
MulticoreTSNE
as
TSNE
import
hdbscan
DATA_DIR
=
os
.
path
.
join
(
'..'
,
'data'
,
'rave'
)
LIVE
=
False
```
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
```
python
spectra
=
np
.
load
(
os
.
path
.
join
(
DATA_DIR
,
'ravedr6.spectra.npy'
))
```
%% Cell type:code id: tags:
```
python
spectra
.
shape
```
%% Cell type:code id: tags:
```
python
wl_range
=
8446.12176814
+
np
.
cumsum
(
np
.
ones
(
1024
)
*
0.30025021
)
```
%% Cell type:code id: tags:
```
python
def
plot_spectra
(
spectra
,
wl_range
=
wl_range
,
txt
=
None
):
nspectra
=
min
(
10
,
len
(
spectra
))
plt
.
figure
(
figsize
=
(
14
,
1.5
*
nspectra
))
ax
=
plt
.
subplot
(
111
)
for
i
in
range
(
nspectra
):
ax
.
plot
(
wl_range
,
spectra
[
i
]
+
i
*
0.7
,
'k-'
)
if
txt
is
not
None
:
for
i
,
t
in
enumerate
(
txt
):
ax
.
text
(
wl_range
[
20
],
i
*
0.7
+
1.1
,
t
)
ax
.
set_ylim
(
0
,
0.7
+
0.7
*
nspectra
)
ax
.
set_xlim
(
wl_range
[
0
],
wl_range
[
-
1
])
y0
,
y1
=
ax
.
get_ylim
()
for
wl
in
(
8498
,
8542
,
8662
):
ax
.
plot
([
wl
,
wl
],
[
y0
,
y1
],
'k-'
,
alpha
=
0.3
)
ax
.
set_xlabel
(
r
'Wavelength $\mathrm{[\AA]}$'
)
ax
.
set_ylabel
(
r
'Flux'
)
```
%% Cell type:code id: tags:
```
python
plot_spectra
(
spectra
[
np
.
random
.
randint
(
spectra
.
shape
[
0
],
size
=
10
)])
```
%% Cell type:markdown id: tags:
<br><br><br><br><br>
## Dimensionality reduction
%% Cell type:markdown id: tags:

_Source: https://subscription.packtpub.com/book/data/9781789955750/1/ch01lvl1sec03/the-three-different-types-of-machine-learning_
%% Cell type:markdown id: tags:
## Principal Component Analysis (PCA)
%% Cell type:code id: tags:
```
python
spectra_std
=
StandardScaler
().
fit_transform
(
spectra
)
#spectra_std = (spectra - np.mean(spectra, axis=0)) / np.std(spectra, axis=0)
```
%% Cell type:code id: tags:
```
python
%%
time
pca
=
PCA
(
n_components
=
2
)
Ypca
=
pca
.
fit_transform
(
spectra_std
)
```
%% Cell type:code id: tags:
```
python
Ypca
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
8
,
8
))
extent
=
np
.
array
((
-
1
,
1
,
-
1
,
1
))
*
18
hb
=
plt
.
hexbin
(
Ypca
[:,
0
],
Ypca
[:,
1
],
gridsize
=
80
,
extent
=
extent
,
mincnt
=
1
)
plt
.
axis
(
extent
);
```
%% Cell type:code id: tags:
```
python
%%
time
cov
=
np
.
cov
(
spectra_std
.
T
)
eigen_values
,
eigen_vectors
=
np
.
linalg
.
eig
(
cov
)
Ypca_hm
=
np
.
dot
(
spectra_std
,
eigen_vectors
[:,
:
2
])
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
8
,
8
))
hb
=
plt
.
hexbin
(
-
Ypca_hm
[:,
0
],
-
Ypca_hm
[:,
1
],
gridsize
=
80
,
mincnt
=
1
,
extent
=
extent
)
plt
.
axis
(
extent
);
```
%% Cell type:code id: tags:
```
python
plot_spectra
(
spectra
[
np
.
argwhere
(
Ypca
[:,
1
]
>
10
)].
squeeze
())
```
%% Cell type:markdown id: tags:
## t-distributed Stochastic Neighbor Embedding (t-SNE)
%% Cell type:code id: tags:
```
python
%%
time
if
LIVE
:
tsne
=
TSNE
(
n_jobs
=
6
)
Y
=
tsne
.
fit_transform
(
spectra
[:
10000
])
else
:
# load t-SNE
Y
=
np
.
load
(
os
.
path
.
join
(
DATA_DIR
,
'ravedr6.tsne.npy'
))
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
10
,
8
))
extent
=
np
.
array
((
-
1
,
1
,
-
1
,
1
))
*
30
hb
=
plt
.
hexbin
(
Y
[:,
0
],
Y
[:,
1
],
gridsize
=
(
80
,
40
),
lw
=
0.3
,
extent
=
extent
,
mincnt
=
1
)
plt
.
axis
(
extent
);
plt
.
colorbar
();
```
%% Cell type:markdown id: tags:
## Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBscan)
%% Cell type:code id: tags:
```
python
clusterer
=
hdbscan
.
HDBSCAN
(
min_cluster_size
=
100
,
min_samples
=
50
)
```
%% Cell type:code id: tags:
```
python
clusterer
.
fit
(
Y
)
```
%% Cell type:code id: tags:
```
python
clusterer
.
labels_
```
%% Cell type:code id: tags:
```
python
clusterer
.
labels_
.
max
()
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
10
,
8
))
extent
=
np
.
array
((
-
1
,
1
,
-
1
,
1
))
*
30
hb
=
plt
.
hexbin
(
Y
[:,
0
],
Y
[:,
1
],
C
=
clusterer
.
labels_
,
gridsize
=
(
80
,
40
),
lw
=
0.3
,
extent
=
extent
,
cmap
=
plt
.
cm
.
get_cmap
(
'gist_ncar'
,
9
),
vmin
=-
1.5
,
vmax
=
7.5
)
plt
.
axis
(
extent
)
cbar
=
plt
.
colorbar
()
cbar
.
set_label
(
'Labels'
)
```
%% Cell type:code id: tags:
```
python
sel
=
spectra
[
clusterer
.
labels_
==
1
]
plot_spectra
(
sel
[
np
.
random
.
randint
(
0
,
len
(
sel
)
+
1
,
10
)])
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
18
,
10
))
ax
=
plt
.
subplot
(
111
)
clusterer
.
condensed_tree_
.
plot
(
select_clusters
=
True
,
axis
=
ax
);
```
%% Cell type:code id: tags:
```
python
```
1_unsupervised/.ipynb_checkpoints/unsupervised.2-checkpoint.ipynb
0 → 100644
View file @
751ceb4e
%% Cell type:markdown id: tags:
# 1. Unsupervised learning - Part 2 - Autoencoder
%% Cell type:markdown id: tags:
### Literature / Sources
*
[
Reducing the Dimensionality of Data with Neural Networks, Hinton & Salakhutdinov, 2006
](
https://www.cs.toronto.edu/~hinton/science.pdf
)
<br>
*
[
Variational Autoencoder
](
https://keras.io/examples/generative/vae/
)
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
```
python
import
os
import
pickle
import
matplotlib.pyplot
as
plt
import
numpy
as
np
import
pandas
as
pd
import
tensorflow
as
tf
from
sklearn
import
preprocessing
from
sklearn.model_selection
import
train_test_split
```
%% Cell type:code id: tags:
```
python
import
tensorflow
as
tf
from
tensorflow
import
keras
from
tensorflow.keras
import
layers
from
tensorflow.keras.optimizers
import
Adam
import
keras.backend
as
K
DATA_DIR
=
os
.
path
.
join
(
'..'
,
'data'
,
'rave'
)
TRAINED_DIR
=
os
.
path
.
join
(
'..'
,
'data'
,
'trained'
)
LIVE
=
False
```
%% Cell type:code id: tags:
```
python
tf
.
config
.
list_physical_devices
(
'GPU'
)
```
%% Cell type:code id: tags:
```
python
tf
.
config
.
experimental
.
set_memory_growth
(
tf
.
config
.
list_physical_devices
(
'GPU'
)[
0
],
True
)
```
%% Cell type:code id: tags:
```
python
tf
.
__version__
```
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
```
python
spectra
=
np
.
load
(
os
.
path
.
join
(
DATA_DIR
,
'ravedr6.spectra.npy'
))
```
%% Cell type:code id: tags:
```
python
rave
=
pd
.
read_csv
(
os
.
path
.
join
(
DATA_DIR
,
'ravedr6.csv'
))
```
%% Cell type:code id: tags:
```
python
labels
=
pd
.
read_csv
(
os
.
path
.
join
(
DATA_DIR
,
'ravedr6.apogee.csv'
))
```
%% Cell type:code id: tags:
```
python
#nspec_idx = rave.index
#rand_sel = np.random.choice(range(len(nspec_idx)), 10000, replace=False)
#spec_sample = spectra[nspec_idx][rand_sel]
```
%% Cell type:code id: tags:
```
python
spec_sample
=
spectra
[
rave
[
rave
.
RAVE_OBS_ID
.
isin
(
labels
.
RAVE_OBS_ID
)].
index
]
```
%% Cell type:code id: tags:
```
python
spec_sample
.
shape
```
%% Cell type:code id: tags:
```
python
wl_range
=
8446.12176814
+
np
.
cumsum
(
np
.
ones
(
1024
)
*
0.30025021
)
def
plot_spectra
(
spectra
,
wl_range
=
wl_range
,
txt
=
None
):
nspectra
=
min
(
10
,
len
(
spectra
))
plt
.
figure
(
figsize
=
(
14
,
1.5
*
nspectra
))
ax
=
plt
.
subplot
(
111
)
for
i
in
range
(
nspectra
):
ax
.
plot
(
wl_range
,
spectra
[
i
]
+
i
*
0.7
,
'k-'
)
if
txt
is
not
None
:
for
i
,
t
in
enumerate
(
txt
):
ax
.
text
(
wl_range
[
20
],
i
*
0.7
+
1.1
,
t
)
ax
.
set_ylim
(
0
,
0.7
+
0.7
*
nspectra
)
ax
.
set_xlim
(
wl_range
[
0
],
wl_range
[
-
1
])
y0
,
y1
=
ax
.
get_ylim
()
for
wl
in
(
8498
,
8542
,
8662
):
ax
.
plot
([
wl
,
wl
],
[
y0
,
y1
],
'k-'
,
alpha
=
0.3
)
ax
.
set_xlabel
(
r
'Wavelength $\mathrm{[\AA]}$'
)
ax
.
set_ylabel
(
r
'Flux'
)
```
%% Cell type:markdown id: tags:
## Architecture
%% Cell type:markdown id: tags:

_Source: https://mc.ai/auto-encoder-in-biology/_
%% Cell type:code id: tags:
```
python
latent_dim
=
2
encoder_ipt
=
layers
.
Input
(
shape
=
(
1024
,
1
))
x
=
layers
.
Convolution1D
(
16
,
kernel_size
=
3
,
padding
=
'same'
,
name
=
'convolution1'
)(
encoder_ipt
)
x
=
layers
.
BatchNormalization
()(
x
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
layers
.
MaxPooling1D
(
pool_size
=
2
,
padding
=
'same'
)(
x
)
x
=
layers
.
Convolution1D
(
32
,
kernel_size
=
3
,
padding
=
'same'
,
name
=
'convolution2'
)(
x
)
x
=
layers
.
BatchNormalization
()(
x
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
layers
.
MaxPooling1D
(
pool_size
=
2
,
padding
=
'same'
)(
x
)
x
=
layers
.
Flatten
()(
x
)
x
=
layers
.
Dense
(
32
,
name
=
'dense1'
)(
x
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
layers
.
Dense
(
latent_dim
,
name
=
'dense2'
)(
x
)
x
=
layers
.
LeakyReLU
()(
x
)
encoder
=
keras
.
Model
(
encoder_ipt
,
x
,
name
=
'encoder'
)
```
%% Cell type:code id: tags:
```
python
encoder
.
summary
()
```
%% Cell type:code id: tags:
```
python
def
Conv1DTranspose
(
input_tensor
,
filters
,
kernel_size
,
strides
=
2
,
padding
=
'same'
):
# Tensorflow v2.3.0 has Conv1DTranspose, but 2.2.0 does not
x
=
layers
.
Lambda
(
lambda
x
:
K
.
expand_dims
(
x
,
axis
=
2
))(
input_tensor
)
x
=
layers
.
Conv2DTranspose
(
filters
=
filters
,
kernel_size
=
(
kernel_size
,
1
),
strides
=
(
strides
,
1
),
padding
=
padding
)(
x
)
x
=
layers
.
Lambda
(
lambda
x
:
K
.
squeeze
(
x
,
axis
=
2
))(
x
)
return
x
```
%% Cell type:code id: tags:
```
python
decoder_ipt
=
layers
.
Input
(
shape
=
(
latent_dim
,))
x
=
layers
.
Dense
(
32
,
name
=
'dense3'
)(
decoder_ipt
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
layers
.
Dense
(
8192
,
name
=
'dense4'
)(
x
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
layers
.
Reshape
((
256
,
32
))(
x
)
x
=
Conv1DTranspose
(
x
,
16
,
3
,
padding
=
'same'
)
x
=
layers
.
LeakyReLU
()(
x
)
x
=
Conv1DTranspose
(
x
,
1
,
3
,
padding
=
'same'
)
x
=
layers
.
LeakyReLU
()(
x
)
decoder
=
keras
.
Model
(
decoder_ipt
,
x
,
name
=
'decoder'
)
```
%% Cell type:code id: tags:
```
python
decoder
.
summary
()
```
%% Cell type:markdown id: tags:
[
Writing a training loop from scratch
](
https://keras.io/guides/writing_a_training_loop_from_scratch/
)
%% Cell type:code id: tags:
```
python
class
AutoEncoder
(
keras
.
Model
):
def
__init__
(
self
,
encoder
,
decoder
,
**
kwargs
):
super
(
AutoEncoder
,
self
).
__init__
(
**
kwargs
)
self
.
encoder
=
encoder
self
.
decoder
=
decoder
def
train_step
(
self
,
data
):
with
tf
.
GradientTape
()
as
tape
:
reconstruction
=
self
.
decoder
(
self
.
encoder
(
data
))
loss
=
tf
.
reduce_mean
(
keras
.
losses
.
mse
(
data
,
reconstruction
))
grads
=
tape
.
gradient
(
loss
,
self
.
trainable_weights
)
self
.
optimizer
.
apply_gradients
(
zip
(
grads
,
self
.
trainable_weights
))
return
{
"loss"
:
loss
}
```
%% Cell type:markdown id: tags:
## Training
%% Cell type:code id: tags:
```
python
ae
=
AutoEncoder
(
encoder
,
decoder
)
ae
.
compile
(
optimizer
=
'adam'
)
```
%% Cell type:code id: tags:
```
python
data
=
np
.
expand_dims
(
spec_sample
,
-
1
)
```
%% Cell type:code id: tags:
```
python
if
LIVE
:
history
=
ae
.
fit
(
data
,
epochs
=
1024
,
batch_size
=
32
)
else
:
ae
.
load_weights
(
os
.
path
.
join
(
TRAINED_DIR
,
'ae.h5'
))
```
%% Cell type:code id: tags:
```
python
#ae.save_weights('ae.h5')
```
%% Cell type:markdown id: tags:
## Results
%% Cell type:code id: tags:
```
python
res
=
encoder
.
predict
(
data
)
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
8
,
6
))
extent
=
(
-
0.2
,
1.5
,
-
0.4
,
1.8
)
plt
.
hexbin
(
res
[:,
0
],
res
[:,
1
],
gridsize
=
(
60
,
30
),
mincnt
=
1
,
C
=
labels
.
FeH
,
extent
=
extent
)
plt
.
colorbar
();
```
%% Cell type:code id: tags:
```
python
plot_spectra
(
spec_sample
[
np
.
argwhere
((
res
[:,
1
]
>
0.45
)
&
(
res
[:,
1
]
<
0.55
))].
squeeze
())
```
%% Cell type:code id: tags:
```
python
[[
i
,
1.0
]
for
i
in
np
.
arange
(
0.0
,
1.5
,
0.15
)]
```
%% Cell type:code id: tags:
```
python
#spec = decoder.predict([[i, 1.0] for i in np.arange(0.0, 1.5, 0.15)])
spec
=
decoder
.
predict
([[
0.75
,
i
]
for
i
in
np
.
arange
(
0.0
,
1.5
,
0.15
)])
plot_spectra
(
spec
.
reshape
((
10
,
1024
)));
```
%% Cell type:code id: tags:
```
python
i
=
13
spec
=
spec_sample
[
i
].
squeeze
()
gen_spec
=
decoder
.
predict
(
res
[
i
:
i
+
1
]).
flatten
()
plt
.
figure
(
figsize
=
(
14
,
4
))
plt
.
plot
(
wl_range
,
spec
,
'k'
)
plt
.
plot
(
wl_range
,
gen_spec
,
'r'
);
plt
.
plot
([
8450
,
8750
],
[
0
,
0
],
'k--'
,
lw
=
1
)
plt
.
plot
(
wl_range
,
spec
-
gen_spec
)
plt
.
text
(
8700
,
0.15
,
r
'$\mu=%.3f\quad\sigma=%.3f$'
%
(
np
.
mean
(
spec
-
gen_spec
),
np
.
std
(
spec
-
gen_spec
)))
plt
.
xlim
(
8450
,
8750
)
plt
.
ylim
(
-
0.2
,
1.4
);
```
%% Cell type:code id: tags:
```
python
plt
.
figure
(
figsize
=
(
14
,
8
))
ax
=
plt
.
axes
([
0.1
,
0.1
,
0.9
,
0.9
])
plt
.
scatter
(
res
[:,
0
],
res
[:,
1
],
c
=
labels
.
FeH
,
s
=
20
)
plt
.
axis
(
extent
)
xx
=
np
.
linspace
(
*
extent
[:
2
],
5
)
yy
=
np
.
linspace
(
*
extent
[
2
:],
5
)
for
j
in
range
(
5
):
for
i
in
range
(
5
):
ax
=
plt
.
axes
([
0.1
+
i
*
0.19
,
0.1
+
j
*
0.19
,
0.14
,
0.14
])
s
=
decoder
.
predict
([[
xx
[
i
],
yy
[
j
]]])
ax
.
plot
(
s
[
0
].
flatten
());