Commit 7c4dea9f authored by mtjvc's avatar mtjvc
Browse files

remove checkpoints

parent a0a8848f
%% Cell type:markdown id: tags:
# 1. Unsupervised learning - Part 1
%% Cell type:markdown id: tags:
### Literature / Sources
#### Papers
* [Visualizing High-Dimensional Data Using t-SNE, van der Maaten & Hinton, 2008](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)
* [Accelerating t-SNE using Tree-Based Algorithms, van der Maaten, 2014](https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf)
#### Links
* [RAVE Survey](https://www.rave-survey.org/)
* [DmitryUlyanov / Multicore-TSNE](https://github.com/DmitryUlyanov/Multicore-TSNE)
* [scikit-learn](https://scikit-learn.org/stable/)
* [hdbscan](https://hdbscan.readthedocs.io/en/latest/index.html)
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from MulticoreTSNE import MulticoreTSNE as TSNE
import hdbscan
DATA_DIR = os.path.join('..', 'data', 'rave')
LIVE = False
```
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
``` python
spectra = np.load(os.path.join(DATA_DIR, 'ravedr6.spectra.npy'))
```
%% Cell type:code id: tags:
``` python
spectra.shape
```
%% Cell type:code id: tags:
``` python
wl_range = 8446.12176814 + np.cumsum(np.ones(1024) * 0.30025021)
```
%% Cell type:code id: tags:
``` python
def plot_spectra(spectra, wl_range=wl_range, txt=None):
nspectra = min(10, len(spectra))
plt.figure(figsize=(14, 1.5 * nspectra))
ax = plt.subplot(111)
for i in range(nspectra):
ax.plot(wl_range, spectra[i] + i * 0.7, 'k-')
if txt is not None:
for i, t in enumerate(txt):
ax.text(wl_range[20], i * 0.7 + 1.1, t)
ax.set_ylim(0, 0.7 + 0.7 * nspectra)
ax.set_xlim(wl_range[0], wl_range[-1])
y0, y1 = ax.get_ylim()
for wl in (8498, 8542, 8662):
ax.plot([wl, wl], [y0, y1], 'k-', alpha=0.3)
ax.set_xlabel(r'Wavelength $\mathrm{[\AA]}$')
ax.set_ylabel(r'Flux')
```
%% Cell type:code id: tags:
``` python
plot_spectra(spectra[np.random.randint(spectra.shape[0], size=10)])
```
%% Cell type:markdown id: tags:
<br><br><br><br><br>
## Dimensionality reduction
%% Cell type:markdown id: tags:
![title](https://static.packt-cdn.com/products/9781789955750/graphics/Images/B13208_01_07.png)
_Source: https://subscription.packtpub.com/book/data/9781789955750/1/ch01lvl1sec03/the-three-different-types-of-machine-learning_
%% Cell type:markdown id: tags:
## Principal Component Analysis (PCA)
%% Cell type:code id: tags:
``` python
spectra_std = StandardScaler().fit_transform(spectra)
#spectra_std = (spectra - np.mean(spectra, axis=0)) / np.std(spectra, axis=0)
```
%% Cell type:code id: tags:
``` python
%%time
pca = PCA(n_components=2)
Ypca = pca.fit_transform(spectra_std)
```
%% Cell type:code id: tags:
``` python
Ypca
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(8, 8))
extent = np.array((-1, 1, -1, 1)) * 18
hb = plt.hexbin(Ypca[:, 0], Ypca[:, 1], gridsize=80, extent=extent, mincnt=1)
plt.axis(extent);
```
%% Cell type:code id: tags:
``` python
%%time
cov = np.cov(spectra_std.T)
eigen_values, eigen_vectors = np.linalg.eig(cov)
Ypca_hm = np.dot(spectra_std, eigen_vectors[:, :2])
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(8, 8))
hb = plt.hexbin(-Ypca_hm[:, 0], -Ypca_hm[:, 1], gridsize=80, mincnt=1, extent=extent)
plt.axis(extent);
```
%% Cell type:code id: tags:
``` python
plot_spectra(spectra[np.argwhere(Ypca[:, 1] > 10)].squeeze())
```
%% Cell type:markdown id: tags:
## t-distributed Stochastic Neighbor Embedding (t-SNE)
%% Cell type:code id: tags:
``` python
%%time
if LIVE:
tsne = TSNE(n_jobs=6)
Y = tsne.fit_transform(spectra[:10000])
else:
# load t-SNE
Y = np.load(os.path.join(DATA_DIR, 'ravedr6.tsne.npy'))
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10, 8))
extent = np.array((-1, 1, -1, 1)) * 30
hb = plt.hexbin(Y[:, 0], Y[:, 1], gridsize=(80, 40), lw=0.3, extent=extent, mincnt=1)
plt.axis(extent);
plt.colorbar();
```
%% Cell type:markdown id: tags:
## Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBscan)
%% Cell type:code id: tags:
``` python
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=50)
```
%% Cell type:code id: tags:
``` python
clusterer.fit(Y)
```
%% Cell type:code id: tags:
``` python
clusterer.labels_
```
%% Cell type:code id: tags:
``` python
clusterer.labels_.max()
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10, 8))
extent = np.array((-1, 1, -1, 1)) * 30
hb = plt.hexbin(Y[:, 0], Y[:, 1], C=clusterer.labels_, gridsize=(80, 40), lw=0.3, extent=extent,
cmap=plt.cm.get_cmap('gist_ncar', 9), vmin=-1.5, vmax=7.5)
plt.axis(extent)
cbar = plt.colorbar()
cbar.set_label('Labels')
```
%% Cell type:code id: tags:
``` python
sel = spectra[clusterer.labels_ == 1]
plot_spectra(sel[np.random.randint(0, len(sel) + 1, 10)])
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(18, 10))
ax = plt.subplot(111)
clusterer.condensed_tree_.plot(select_clusters=True, axis=ax);
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# 1. Unsupervised learning - Part 2 - Autoencoder
%% Cell type:markdown id: tags:
### Literature / Sources
* [Reducing the Dimensionality of Data with Neural Networks, Hinton & Salakhutdinov, 2006](https://www.cs.toronto.edu/~hinton/science.pdf)
<br>
* [Variational Autoencoder](https://keras.io/examples/generative/vae/)
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
```
%% Cell type:code id: tags:
``` python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import keras.backend as K
DATA_DIR = os.path.join('..', 'data', 'rave')
TRAINED_DIR = os.path.join('..', 'data', 'trained')
LIVE = False
```
%% Cell type:code id: tags:
``` python
tf.config.list_physical_devices('GPU')
```
%% Cell type:code id: tags:
``` python
tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
```
%% Cell type:code id: tags:
``` python
tf.__version__
```
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
``` python
spectra = np.load(os.path.join(DATA_DIR, 'ravedr6.spectra.npy'))
```
%% Cell type:code id: tags:
``` python
rave = pd.read_csv(os.path.join(DATA_DIR, 'ravedr6.csv'))
```
%% Cell type:code id: tags:
``` python
labels = pd.read_csv(os.path.join(DATA_DIR, 'ravedr6.apogee.csv'))
```
%% Cell type:code id: tags:
``` python
#nspec_idx = rave.index
#rand_sel = np.random.choice(range(len(nspec_idx)), 10000, replace=False)
#spec_sample = spectra[nspec_idx][rand_sel]
```
%% Cell type:code id: tags:
``` python
spec_sample = spectra[rave[rave.RAVE_OBS_ID.isin(labels.RAVE_OBS_ID)].index]
```
%% Cell type:code id: tags:
``` python
spec_sample.shape
```
%% Cell type:code id: tags:
``` python
wl_range = 8446.12176814 + np.cumsum(np.ones(1024) * 0.30025021)
def plot_spectra(spectra, wl_range=wl_range, txt=None):
nspectra = min(10, len(spectra))
plt.figure(figsize=(14, 1.5 * nspectra))
ax = plt.subplot(111)
for i in range(nspectra):
ax.plot(wl_range, spectra[i] + i * 0.7, 'k-')
if txt is not None:
for i, t in enumerate(txt):
ax.text(wl_range[20], i * 0.7 + 1.1, t)
ax.set_ylim(0, 0.7 + 0.7 * nspectra)
ax.set_xlim(wl_range[0], wl_range[-1])
y0, y1 = ax.get_ylim()
for wl in (8498, 8542, 8662):
ax.plot([wl, wl], [y0, y1], 'k-', alpha=0.3)
ax.set_xlabel(r'Wavelength $\mathrm{[\AA]}$')
ax.set_ylabel(r'Flux')
```
%% Cell type:markdown id: tags:
## Architecture
%% Cell type:markdown id: tags:
![](https://miro.medium.com/max/1000/0*uq2_ZipB9TqI9G_k)
_Source: https://mc.ai/auto-encoder-in-biology/_
%% Cell type:code id: tags:
``` python
latent_dim = 2
encoder_ipt = layers.Input(shape=(1024, 1))
x = layers.Convolution1D(16, kernel_size=3, padding='same', name='convolution1')(encoder_ipt)
x = layers.BatchNormalization()(x)
x = layers.LeakyReLU()(x)
x = layers.MaxPooling1D(pool_size=2, padding='same')(x)
x = layers.Convolution1D(32, kernel_size=3, padding='same', name='convolution2')(x)
x = layers.BatchNormalization()(x)
x = layers.LeakyReLU()(x)
x = layers.MaxPooling1D(pool_size=2, padding='same')(x)
x = layers.Flatten()(x)
x = layers.Dense(32, name='dense1')(x)
x = layers.LeakyReLU()(x)
x = layers.Dense(latent_dim, name='dense2')(x)
x = layers.LeakyReLU()(x)
encoder = keras.Model(encoder_ipt, x, name='encoder')
```
%% Cell type:code id: tags:
``` python
encoder.summary()
```
%% Cell type:code id: tags:
``` python
def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):
# Tensorflow v2.3.0 has Conv1DTranspose, but 2.2.0 does not
x = layers.Lambda(lambda x: K.expand_dims(x, axis=2))(input_tensor)
x = layers.Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)
x = layers.Lambda(lambda x: K.squeeze(x, axis=2))(x)
return x
```
%% Cell type:code id: tags:
``` python
decoder_ipt = layers.Input(shape=(latent_dim,))
x = layers.Dense(32, name='dense3')(decoder_ipt)
x = layers.LeakyReLU()(x)
x = layers.Dense(8192, name='dense4')(x)
x = layers.LeakyReLU()(x)
x = layers.Reshape((256, 32))(x)
x = Conv1DTranspose(x, 16, 3, padding='same')
x = layers.LeakyReLU()(x)
x = Conv1DTranspose(x, 1, 3, padding='same')
x = layers.LeakyReLU()(x)
decoder = keras.Model(decoder_ipt, x, name='decoder')
```
%% Cell type:code id: tags:
``` python
decoder.summary()
```
%% Cell type:markdown id: tags:
[Writing a training loop from scratch](https://keras.io/guides/writing_a_training_loop_from_scratch/)
%% Cell type:code id: tags:
``` python
class AutoEncoder(keras.Model):
def __init__(self, encoder, decoder, **kwargs):
super(AutoEncoder, self).__init__(**kwargs)
self.encoder = encoder
self.decoder = decoder
def train_step(self, data):
with tf.GradientTape() as tape:
reconstruction = self.decoder(self.encoder(data))
loss = tf.reduce_mean(keras.losses.mse(data, reconstruction))
grads = tape.gradient(loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
return {"loss": loss}
```
%% Cell type:markdown id: tags:
## Training
%% Cell type:code id: tags:
``` python
ae = AutoEncoder(encoder, decoder)
ae.compile(optimizer='adam')
```
%% Cell type:code id: tags:
``` python
data = np.expand_dims(spec_sample, -1)
```
%% Cell type:code id: tags:
``` python
if LIVE:
history = ae.fit(data, epochs=1024, batch_size=32)
else:
ae.load_weights(os.path.join(TRAINED_DIR, 'ae.h5'))
```
%% Cell type:code id: tags:
``` python
#ae.save_weights('ae.h5')
```
%% Cell type:markdown id: tags:
## Results
%% Cell type:code id: tags:
``` python
res = encoder.predict(data)
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(8, 6))
extent = (-0.2, 1.5, -0.4, 1.8)
plt.hexbin(res[:, 0], res[:, 1], gridsize=(60, 30), mincnt=1, C=labels.FeH, extent=extent)
plt.colorbar();
```
%% Cell type:code id: tags:
``` python
plot_spectra(spec_sample[np.argwhere((res[:, 1] > 0.45) & (res[:, 1] < 0.55))].squeeze())
```
%% Cell type:code id: tags:
``` python
[[i, 1.0] for i in np.arange(0.0, 1.5, 0.15)]
```
%% Cell type:code id: tags:
``` python
#spec = decoder.predict([[i, 1.0] for i in np.arange(0.0, 1.5, 0.15)])
spec = decoder.predict([[0.75, i] for i in np.arange(0.0, 1.5, 0.15)])
plot_spectra(spec.reshape((10, 1024)));
```
%% Cell type:code id: tags:
``` python
i = 13
spec = spec_sample[i].squeeze()
gen_spec = decoder.predict(res[i:i + 1]).flatten()
plt.figure(figsize=(14, 4))
plt.plot(wl_range, spec, 'k')
plt.plot(wl_range, gen_spec, 'r');
plt.plot([8450, 8750], [0, 0], 'k--', lw=1)
plt.plot(wl_range, spec - gen_spec)
plt.text(8700, 0.15, r'$\mu=%.3f\quad\sigma=%.3f$' % (np.mean(spec - gen_spec), np.std(spec - gen_spec)))
plt.xlim(8450, 8750)
plt.ylim(-0.2, 1.4);
```
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(14, 8))
ax = plt.axes([0.1, 0.1, 0.9, 0.9])
plt.scatter(res[:, 0], res[:, 1], c=labels.FeH, s=20)
plt.axis(extent)
xx = np.linspace(*extent[:2], 5)
yy = np.linspace(*extent[2:], 5)
for j in range(5):
for i in range(5):
ax = plt.axes([0.1 + i * 0.19, 0.1 + j * 0.19, 0.14, 0.14])
s = decoder.predict([[xx[i], yy[j]]])
ax.plot(s[0].flatten());
ax.set_ylim(0, 1.4)
ax.set_xticks([])
ax.set_yticks([])