Commit 7c4dea9f authored by mtjvc
Browse files

remove checkpoints

parent a0a8848f
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Unsupervised learning - Part 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Literature / Sources\n",
"\n",
"#### Papers\n",
"* [Visualizing High-Dimensional Data Using t-SNE, van der Maaten & Hinton, 2008](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)\n",
"* [Accelerating t-SNE using Tree-Based Algorithms, van der Maaten, 2014](https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf)\n",
"\n",
"#### Links\n",
"* [RAVE Survey](https://www.rave-survey.org/)\n",
"* [DmitryUlyanov / Multicore-TSNE](https://github.com/DmitryUlyanov/Multicore-TSNE)\n",
"* [scikit-learn](https://scikit-learn.org/stable/)\n",
"* [hdbscan](https://hdbscan.readthedocs.io/en/latest/index.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"from MulticoreTSNE import MulticoreTSNE as TSNE\n",
"import hdbscan\n",
"\n",
"DATA_DIR = os.path.join('..', 'data', 'rave')\n",
"LIVE = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spectra = np.load(os.path.join(DATA_DIR, 'ravedr6.spectra.npy'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spectra.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wl_range = 8446.12176814 + np.cumsum(np.ones(1024) * 0.30025021)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_spectra(spectra, wl_range=wl_range, txt=None):\n",
" nspectra = min(10, len(spectra))\n",
" plt.figure(figsize=(14, 1.5 * nspectra))\n",
" ax = plt.subplot(111)\n",
" for i in range(nspectra):\n",
" ax.plot(wl_range, spectra[i] + i * 0.7, 'k-')\n",
" if txt is not None:\n",
" for i, t in enumerate(txt):\n",
" ax.text(wl_range[20], i * 0.7 + 1.1, t)\n",
" ax.set_ylim(0, 0.7 + 0.7 * nspectra)\n",
" ax.set_xlim(wl_range[0], wl_range[-1])\n",
" y0, y1 = ax.get_ylim()\n",
" for wl in (8498, 8542, 8662):\n",
" ax.plot([wl, wl], [y0, y1], 'k-', alpha=0.3)\n",
" ax.set_xlabel(r'Wavelength $\\mathrm{[\\AA]}$')\n",
" ax.set_ylabel(r'Flux')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_spectra(spectra[np.random.randint(spectra.shape[0], size=10)])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br><br><br><br><br>\n",
"## Dimensionality reduction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![title](https://static.packt-cdn.com/products/9781789955750/graphics/Images/B13208_01_07.png)\n",
"_Source: https://subscription.packtpub.com/book/data/9781789955750/1/ch01lvl1sec03/the-three-different-types-of-machine-learning_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Principal Component Analysis (PCA)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spectra_std = StandardScaler().fit_transform(spectra)\n",
"#spectra_std = (spectra - np.mean(spectra, axis=0)) / np.std(spectra, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"pca = PCA(n_components=2)\n",
"Ypca = pca.fit_transform(spectra_std)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Ypca"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(8, 8))\n",
"extent = np.array((-1, 1, -1, 1)) * 18\n",
"hb = plt.hexbin(Ypca[:, 0], Ypca[:, 1], gridsize=80, extent=extent, mincnt=1)\n",
"plt.axis(extent);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"cov = np.cov(spectra_std.T)\n",
"eigen_values, eigen_vectors = np.linalg.eig(cov)\n",
"Ypca_hm = np.dot(spectra_std, eigen_vectors[:, :2])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(8, 8))\n",
"hb = plt.hexbin(-Ypca_hm[:, 0], -Ypca_hm[:, 1], gridsize=80, mincnt=1, extent=extent)\n",
"plt.axis(extent);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_spectra(spectra[np.argwhere(Ypca[:, 1] > 10)].squeeze())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## t-distributed Stochastic Neighbor Embedding (t-SNE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"if LIVE:\n",
" tsne = TSNE(n_jobs=6)\n",
" Y = tsne.fit_transform(spectra[:10000])\n",
"else:\n",
" # load t-SNE\n",
" Y = np.load(os.path.join(DATA_DIR, 'ravedr6.tsne.npy'))"
]
},
{
"cell_type": "code",
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Unsupervised learning - Part 2 - Autoencoder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Literature / Sources\n",
"\n",
"* [Reducing the Dimensionality of Data with Neural Networks, Hinton & Salakhutdinov, 2006](https://www.cs.toronto.edu/~hinton/science.pdf)\n",
"<br>\n",
"* [Variational Autoencoder](https://keras.io/examples/generative/vae/)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pickle\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"\n",
"from sklearn import preprocessing\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.optimizers import Adam\n",
"\n",
"import keras.backend as K\n",
"\n",
"DATA_DIR = os.path.join('..', 'data', 'rave')\n",
"TRAINED_DIR = os.path.join('..', 'data', 'trained')\n",
"LIVE = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tf.config.list_physical_devices('GPU')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tf.__version__"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spectra = np.load(os.path.join(DATA_DIR, 'ravedr6.spectra.npy'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rave = pd.read_csv(os.path.join(DATA_DIR, 'ravedr6.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"labels = pd.read_csv(os.path.join(DATA_DIR, 'ravedr6.apogee.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#nspec_idx = rave.index\n",
"#rand_sel = np.random.choice(range(len(nspec_idx)), 10000, replace=False)\n",
"#spec_sample = spectra[nspec_idx][rand_sel]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spec_sample = spectra[rave[rave.RAVE_OBS_ID.isin(labels.RAVE_OBS_ID)].index]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"spec_sample.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"wl_range = 8446.12176814 + np.cumsum(np.ones(1024) * 0.30025021)\n",
"\n",
"def plot_spectra(spectra, wl_range=wl_range, txt=None):\n",
" nspectra = min(10, len(spectra))\n",
" plt.figure(figsize=(14, 1.5 * nspectra))\n",
" ax = plt.subplot(111)\n",
" for i in range(nspectra):\n",
" ax.plot(wl_range, spectra[i] + i * 0.7, 'k-')\n",
" if txt is not None:\n",
" for i, t in enumerate(txt):\n",
" ax.text(wl_range[20], i * 0.7 + 1.1, t)\n",
" ax.set_ylim(0, 0.7 + 0.7 * nspectra)\n",
" ax.set_xlim(wl_range[0], wl_range[-1])\n",
" y0, y1 = ax.get_ylim()\n",
" for wl in (8498, 8542, 8662):\n",
" ax.plot([wl, wl], [y0, y1], 'k-', alpha=0.3)\n",
" ax.set_xlabel(r'Wavelength $\\mathrm{[\\AA]}$')\n",
" ax.set_ylabel(r'Flux')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Architecture"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![](https://miro.medium.com/max/1000/0*uq2_ZipB9TqI9G_k)\n",
"_Source: https://mc.ai/auto-encoder-in-biology/_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"latent_dim = 2\n",
"\n",
"encoder_ipt = layers.Input(shape=(1024, 1))\n",
"\n",
"x = layers.Convolution1D(16, kernel_size=3, padding='same', name='convolution1')(encoder_ipt)\n",
"x = layers.BatchNormalization()(x)\n",
"x = layers.LeakyReLU()(x)\n",
"x = layers.MaxPooling1D(pool_size=2, padding='same')(x)\n",
" \n",
"x = layers.Convolution1D(32, kernel_size=3, padding='same', name='convolution2')(x)\n",
"x = layers.BatchNormalization()(x)\n",
"x = layers.LeakyReLU()(x)\n",
"x = layers.MaxPooling1D(pool_size=2, padding='same')(x)\n",
"\n",
"x = layers.Flatten()(x)\n",
"\n",
"x = layers.Dense(32, name='dense1')(x)\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"x = layers.Dense(latent_dim, name='dense2')(x)\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"encoder = keras.Model(encoder_ipt, x, name='encoder')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"encoder.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):\n",
" # Tensorflow v2.3.0 has Conv1DTranspose, but 2.2.0 does not\n",
" x = layers.Lambda(lambda x: K.expand_dims(x, axis=2))(input_tensor)\n",
" x = layers.Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)\n",
" x = layers.Lambda(lambda x: K.squeeze(x, axis=2))(x)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"decoder_ipt = layers.Input(shape=(latent_dim,))\n",
"\n",
"x = layers.Dense(32, name='dense3')(decoder_ipt)\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"x = layers.Dense(8192, name='dense4')(x)\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"x = layers.Reshape((256, 32))(x)\n",
"\n",
"x = Conv1DTranspose(x, 16, 3, padding='same')\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"x = Conv1DTranspose(x, 1, 3, padding='same')\n",
"x = layers.LeakyReLU()(x)\n",
"\n",
"decoder = keras.Model(decoder_ipt, x, name='decoder')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"decoder.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Writing a training loop from scratch](https://keras.io/guides/writing_a_training_loop_from_scratch/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class AutoEncoder(keras.Model):\n",
" def __init__(self, encoder, decoder, **kwargs):\n",
" super(AutoEncoder, self).__init__(**kwargs)\n",
" self.encoder = encoder\n",
" self.decoder = decoder\n",
"\n",
" def train_step(self, data):\n",
" with tf.GradientTape() as tape:\n",
" reconstruction = self.decoder(self.encoder(data))\n",
" loss = tf.reduce_mean(keras.losses.mse(data, reconstruction))\n",
" \n",
" grads = tape.gradient(loss, self.trainable_weights)\n",
" self.optimizer.apply_gradients(zip(grads, self.trainable_weights))\n",
" return {\"loss\": loss}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ae = AutoEncoder(encoder, decoder)\n",
"ae.compile(optimizer='adam')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = np.expand_dims(spec_sample, -1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"if LIVE:\n",
" history = ae.fit(data, epochs=1024, batch_size=32)\n",
"else:\n",
" ae.load_weights(os.path.join(TRAINED_DIR, 'ae.h5'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#ae.save_weights('ae.h5')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"res = encoder.predict(data)"
]
},
{