Commit 7c4dea9f authored by mtjvc's avatar mtjvc
Browse files

remove checkpoints

parent a0a8848f
%% Cell type:markdown id: tags:
# 1. Unsupervised learning - Part 1
%% Cell type:markdown id: tags:
### Literature / Sources
#### Papers
* [Visualizing High-Dimensional Data Using t-SNE, van der Maaten & Hinton, 2008](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)
* [Accelerating t-SNE using Tree-Based Algorithms, van der Maaten, 2014](https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf)
#### Links
* [RAVE Survey](https://www.rave-survey.org/)
* [DmitryUlyanov / Multicore-TSNE](https://github.com/DmitryUlyanov/Multicore-TSNE)
* [scikit-learn](https://scikit-learn.org/stable/)
* [hdbscan](https://hdbscan.readthedocs.io/en/latest/index.html)
%% Cell type:markdown id: tags:
## Imports
%% Cell type:code id: tags:
``` python
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from MulticoreTSNE import MulticoreTSNE as TSNE
import hdbscan
# Directory (relative to the working directory) holding the RAVE .npy files loaded below.
DATA_DIR = os.path.join('..', 'data', 'rave')
# True: recompute the t-SNE embedding in the t-SNE cell; False: load the saved one from DATA_DIR.
LIVE = False
```
%% Cell type:markdown id: tags:
## Data
%% Cell type:code id: tags:
``` python
# Load the spectra array from its .npy file in DATA_DIR.
# NOTE(review): presumably one spectrum per row, 1024 flux samples each (matches wl_range) — confirm.
spectra = np.load(os.path.join(DATA_DIR, 'ravedr6.spectra.npy'))
```
%% Cell type:code id: tags:
``` python
# Show the array dimensions; rows are treated as individual spectra further down.
spectra.shape
```
%% Cell type:code id: tags:
``` python
# Wavelength grid with 1024 points: a constant step of 0.30025021 accumulated on
# top of 8446.12176814, so the first grid value is the offset plus one step.
# Units are presumably Angstrom (the plot axis label uses \AA) — TODO confirm.
wl_range = 8446.12176814 + np.cumsum(np.ones(1024) * 0.30025021)
```
%% Cell type:code id: tags:
``` python
def plot_spectra(spectra, wl_range=wl_range, txt=None):
    """Plot up to ten spectra stacked vertically in a single figure.

    Every spectrum is drawn against ``wl_range`` and shifted upwards by 0.7
    per row so the curves do not overlap. Three faint vertical guide lines
    are drawn at 8498, 8542 and 8662 (presumably the Ca II triplet --
    TODO confirm).

    Parameters
    ----------
    spectra : array-like
        Either a 2-D collection with one spectrum per row, or a single 1-D
        spectrum. Only the first ten rows are plotted.
    wl_range : array-like, optional
        Wavelength of each flux sample. Defaults to the module-level grid
        as it was when this function was defined.
    txt : sequence of str, optional
        Labels written next to the spectra, one per plotted row. Labels
        beyond the number of plotted rows are ignored.
    """
    # A lone 1-D spectrum (e.g. a single match that went through
    # ``.squeeze()``) would otherwise be read as many one-sample spectra.
    spectra = np.atleast_2d(spectra)
    nspectra = min(10, len(spectra))

    # Figure height grows with the number of stacked spectra.
    plt.figure(figsize=(14, 1.5 * nspectra))
    ax = plt.subplot(111)
    for i in range(nspectra):
        ax.plot(wl_range, spectra[i] + i * 0.7, 'k-')

    if txt is not None:
        # Pair labels with plotted rows only; surplus labels would end up
        # above the visible y-range.
        for i, t in zip(range(nspectra), txt):
            ax.text(wl_range[20], i * 0.7 + 1.1, t)

    ax.set_ylim(0, 0.7 + 0.7 * nspectra)
    ax.set_xlim(wl_range[0], wl_range[-1])

    # Guide lines span the full (already fixed) y-range.
    y0, y1 = ax.get_ylim()
    for wl in (8498, 8542, 8662):
        ax.plot([wl, wl], [y0, y1], 'k-', alpha=0.3)

    ax.set_xlabel(r'Wavelength $\mathrm{[\AA]}$')
    ax.set_ylabel(r'Flux')
```
%% Cell type:code id: tags:
``` python
# Draw ten random row indices (with replacement) and plot the matching spectra.
random_rows = np.random.randint(spectra.shape[0], size=10)
plot_spectra(spectra[random_rows])
```
%% Cell type:markdown id: tags:
<br><br><br><br><br>
## Dimensionality reduction
%% Cell type:markdown id: tags:
![title](https://static.packt-cdn.com/products/9781789955750/graphics/Images/B13208_01_07.png)
_Source: https://subscription.packtpub.com/book/data/9781789955750/1/ch01lvl1sec03/the-three-different-types-of-machine-learning_
%% Cell type:markdown id: tags:
## Principal Component Analysis (PCA)
%% Cell type:code id: tags:
``` python
# Standardize every column of `spectra` to zero mean and unit variance before PCA.
spectra_std = StandardScaler().fit_transform(spectra)
# Manual form of the same standardization, kept for reference.
# NOTE(review): per the scikit-learn docs StandardScaler leaves zero-variance
# columns unscaled, whereas the line below would divide by zero there.
#spectra_std = (spectra - np.mean(spectra, axis=0)) / np.std(spectra, axis=0)
```
%% Cell type:code id: tags:
``` python
%%time
# Fit a two-component PCA on the standardized spectra and project them onto it.
pca = PCA(n_components=2)
Ypca = pca.fit_transform(spectra_std)
```
%% Cell type:code id: tags:
``` python
# Display the projected coordinates (one row per spectrum, one column per component).
Ypca
```
%% Cell type:code id: tags:
``` python
# Density map of the PCA projection on a fixed square window of +/-18.
# `extent` is reused by the manual-PCA plot further down.
half_width = 18
extent = np.array((-1, 1, -1, 1)) * half_width
plt.figure(figsize=(8, 8))
hb = plt.hexbin(
    Ypca[:, 0],
    Ypca[:, 1],
    gridsize=80,
    mincnt=1,  # hexagons without any point stay blank
    extent=extent,
)
plt.axis(extent);
```
%% Cell type:code id: tags:
``` python
%%time
cov = np.cov(spectra_std.T)
eigen_values, eigen_vectors = np.linalg.eig(cov)
Ypca_hm = np.dot(spectra_std, eigen_vectors[:, :2])
```
%% Cell type:code id: tags:
``` python
# Density map of the hand-made PCA projection, drawn on the same window as the
# scikit-learn one. Both axes are negated; presumably this mirrors the plot so
# it lines up with the scikit-learn projection (eigenvector sign is arbitrary).
pc1_flipped, pc2_flipped = -Ypca_hm[:, 0], -Ypca_hm[:, 1]
plt.figure(figsize=(8, 8))
hb = plt.hexbin(
    pc1_flipped,
    pc2_flipped,
    gridsize=80,
    extent=extent,
    mincnt=1,
)
plt.axis(extent);
```
%% Cell type:code id: tags:
``` python
# Inspect the spectra that lie far out along the second principal component.
# A boolean mask always yields a 2-D (n_selected, n_samples) array, whereas
# argwhere(...).squeeze() collapsed to 1-D when exactly one spectrum matched.
plot_spectra(spectra[Ypca[:, 1] > 10])
```
%% Cell type:markdown id: tags:
## t-distributed Stochastic Neighbor Embedding (t-SNE)
%% Cell type:code id: tags:
``` python
%%time
if LIVE:
tsne = TSNE(n_jobs=6)
Y = tsne.fit_transform(spectra[:10000])
else:
# load t-SNE
Y = np.load(os.path.join(DATA_DIR, 'ravedr6.tsne.npy'))
```
%% Cell type:code id: tags:
``` python
# Density map of the t-SNE embedding on a fixed square window of +/-30.
extent = 30 * np.array((-1, 1, -1, 1))
plt.figure(figsize=(10, 8))
hb = plt.hexbin(
    Y[:, 0],
    Y[:, 1],
    gridsize=(80, 40),
    extent=extent,
    mincnt=1,  # hexagons without any point stay blank
    lw=0.3,
)
plt.axis(extent);
plt.colorbar();
```
%% Cell type:markdown id: tags:
## Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)
%% Cell type:code id: tags:
``` python
# Density-based clusterer: a group needs at least 100 members to count as a cluster.
# NOTE(review): per the hdbscan docs a larger min_samples makes the clustering more
# conservative (more points labelled as noise) — confirm 50 is the intended setting.
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=50)
```
%% Cell type:code id: tags:
``` python
# Cluster the points of the t-SNE embedding Y.
clusterer.fit(Y)
```
%% Cell type:code id: tags:
``` python
# One cluster label per row of Y; per the hdbscan docs, noise points get the label -1.
clusterer.labels_
```
%% Cell type:code id: tags:
``` python
# Largest cluster label; with cluster labels starting at 0 this is (number of clusters - 1).
clusterer.labels_.max()
```
%% Cell type:code id: tags:
``` python
# t-SNE embedding coloured by cluster label.
# The colour scale has 9 discrete steps between -1.5 and 7.5, i.e. one step
# centred on each integer label from -1 (noise) to 7.
plt.figure(figsize=(10, 8))
extent = np.array((-1, 1, -1, 1)) * 30
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap accepts the same (name, lut) arguments.
label_cmap = plt.get_cmap('gist_ncar', 9)
hb = plt.hexbin(Y[:, 0], Y[:, 1], C=clusterer.labels_, gridsize=(80, 40), lw=0.3, extent=extent,
                cmap=label_cmap, vmin=-1.5, vmax=7.5)
plt.axis(extent)
cbar = plt.colorbar()
cbar.set_label('Labels')
```
%% Cell type:code id: tags:
``` python
# Spectra assigned to cluster 1. Labels are aligned with the rows of Y, which
# in LIVE mode covers only the first rows of `spectra`, so trim before masking.
sel = spectra[:len(clusterer.labels_)][clusterer.labels_ == 1]
# Plot ten random members. randint's upper bound is exclusive, so len(sel)
# (not len(sel) + 1) keeps every drawn index valid.
plot_spectra(sel[np.random.randint(0, len(sel), 10)])
```
%% Cell type:code id: tags:
``` python
# Draw the condensed cluster tree on a wide figure and highlight the selected clusters.
fig = plt.figure(figsize=(18, 10))
ax = fig.add_subplot(111)
clusterer.condensed_tree_.plot(axis=ax, select_clusters=True);
```
%% Cell type:code id: tags:
``` python
```