See the KDnuggets 2018 poll on the top tools for analytics, data science, and machine learning: https://www.kdnuggets.com/2018/05/poll-tools-analytics-data-science-machine-learning-results.html
%%HTML
<style>
    th, td {
        font-size: 18px;
    }
</style>
import pandas as pd
iris = pd.read_csv("https://datahub.io/machine-learning/iris/r/iris.csv")
iris.head(10)
iris.sample(n=10, random_state=42)
iris.describe()
iris['class'].unique()
iris['class'].value_counts()
iris[iris.sepallength > 6.4].head()
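A compact way to see which features separate the classes (a small addition, not in the original notebook, using standard pandas):
# mean of each numeric feature per class; the petal measurements differ most
iris.groupby('class').mean()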
# with this magic command we do not have to call plt.show() at the end of each cell
%matplotlib inline
# draw a scatter plot matrix with seaborn
import seaborn as sns
sns.pairplot(iris, hue="class")
iris.boxplot(column='petallength', by='class')
# split into data and target variables
iris_data = iris.drop('class', axis=1)
iris_target = iris['class']
# split into train and test set (for later model evaluation)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_target, test_size=0.4, random_state=42, stratify=iris_target)
display("Stratified train split")
display(y_train.value_counts())
display("Stratified test split")
display(y_test.value_counts())
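A quick shape check (not in the original notebook): with test_size=0.4 the 150 iris rows split into 90 training and 60 test rows.
# expect (90, 4) for train and (60, 4) for test
print(X_train.shape, X_test.shape)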
from sklearn import tree
decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
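Before visualizing the tree, it can be useful to check its accuracy on both splits (a small addition; score returns the mean accuracy):
# mean accuracy on the training data and on the held-out test data
print('train accuracy:', decision_tree.score(X_train, y_train))
print('test accuracy: ', decision_tree.score(X_test, y_test))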
# graphviz needs to be installed first, e.g. via one of:
#   conda install -c conda-forge graphviz
#   pip install graphviz
# (remember to install 'python-graphviz' on Windows 10!)
import graphviz
from sklearn.utils.multiclass import unique_labels
dot_data = tree.export_graphviz(decision_tree, feature_names=X_train.columns,
                                class_names=unique_labels(y_train),
                                filled=True, rounded=True, special_characters=True,
                                out_file=None)
graphviz.Source(dot_data)
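The same Source object can also be written to disk instead of rendered inline (a sketch; 'iris_tree' is an arbitrary file name):
# save the rendered tree as iris_tree.png next to the notebook
graphviz.Source(dot_data).render('iris_tree', format='png', cleanup=True)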
y_pred = decision_tree.predict(X_test)  # X_test contains the four features: sepallength, sepalwidth, petallength, petalwidth
print(list(y_pred[:5]))
print(list(y_test[:5]))
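Beyond eyeballing the first five predictions, scikit-learn can summarize precision, recall, and F1 per class (a small addition, not in the original notebook):
# per-class precision, recall and F1 on the test set
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))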
import numpy as np
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes):
    """Plot a confusion matrix; the '.2f' format works for raw counts and normalized values."""
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # write the cell values on top of the image, white on dark cells
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, unique_labels(y_train))
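Because the plot function formats cells with '.2f', it also handles a row-normalized matrix, which shows the fraction of each true class (an optional sketch, not in the original notebook):
# divide each row by its sum to get per-class fractions
cm_normalized = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
plot_confusion_matrix(cm_normalized, unique_labels(y_train))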
Dataset | URL |
---|---|
Data from US / UK Government | http://data.gov / http://data.gov.uk/ |
EU Data Portal | https://www.europeandataportal.eu |
CIA World Fact Book | https://www.cia.gov/... |
DataHub | http://datahub.io |
Linked Open Data Cloud | http://lod-cloud.net |
New York Times (starts 1851) | http://developer.nytimes.com/docs |
Amazon datasets (1000 Genome Project, database of satellite imagery, Million Song Dataset) | http://aws.amazon.com/datasets |
DBPedia | https://wiki.dbpedia.org |
YAGO | https://www.mpi-inf.mpg.de/.../yago/ |
Wikidata | https://www.wikidata.org |
Example SPARQL query: select all resources with a population of exactly 311142.
SELECT ?s
WHERE {
  ?s dbo:populationTotal 311142 .
}
city_dataset = pd.DataFrame(data={'city': ['Mannheim', 'Frankfurt', 'Berlin', 'Stuttgart', 'Bremen', 'Bochum', 'Dortmund'],
                                  'target': ['no', 'yes', 'yes', 'no', 'no', 'no', 'no']})
city_dataset
import requests
for i, row in city_dataset.iterrows():
    # ask DBpedia for the coordinates and population of each city
    query = """SELECT *
               WHERE {{
                 ?s rdfs:label "{}"@en ;
                    geo:lat ?lat ;
                    geo:long ?long ;
                    dbo:populationTotal ?population .
               }}
               LIMIT 1""".format(row['city'])
    # print(query)
    r = requests.get('http://dbpedia.org/sparql',
                     params={'query': query, 'format': 'application/sparql-results+json'})
    bindings = r.json()['results']['bindings']
    if not bindings:
        print('could not find position of {}'.format(row['city']))
        continue
    # store the values as floats so sklearn can use them directly
    city_dataset.at[i, 'long'] = float(bindings[0]['long']['value'])
    city_dataset.at[i, 'lat'] = float(bindings[0]['lat']['value'])
    city_dataset.at[i, 'population'] = float(bindings[0]['population']['value'])
city_dataset
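A quick visual check of the fetched values (a sketch, not in the original notebook, assuming all DBpedia lookups succeeded):
# scatter the cities by coordinates, colored by their target label
colors = city_dataset['target'].map({'yes': 'green', 'no': 'red'})
city_dataset.plot.scatter(x='long', y='lat', c=colors)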
# split into features and target; the city name is just an identifier and is dropped
d = city_dataset.drop(['target', 'city'], axis=1)
p = city_dataset['target']
decision_tree.fit(d, p)
dot_data = tree.export_graphviz(decision_tree,
                                feature_names=d.columns,
                                class_names=unique_labels(p),
                                filled=True, rounded=True, special_characters=True,
                                out_file=None)
graphviz.Source(dot_data)
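To try the fitted tree on an unseen city, here is a hypothetical input row with roughly Munich's coordinates and population (illustrative values, not fetched from DBpedia):
# hypothetical example: approximate longitude, latitude and population of Munich
new_city = pd.DataFrame({'long': [11.58], 'lat': [48.14], 'population': [1472000.0]})
decision_tree.predict(new_city)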
from bs4 import BeautifulSoup
from wordcloud import WordCloud
# get the text of the page
response = requests.get('https://hiwissml.github.io/datafest2019/index.html')
soup = BeautifulSoup(response.content, 'lxml')
text = soup.get_text(' ', strip=True).lower()
# create the wordcloud (install the package first: conda install -c conda-forge wordcloud)
wordcloud = WordCloud(background_color="white",
                      stopwords=['und', 'die', 'der', 'in', 'zu']).generate(text)
plt.figure(figsize = (15,20))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
The Jupyter notebook is available at http://web.informatik.uni-mannheim.de/shertlin/datafest/