add new post
This commit is contained in:
parent
fc295e79d6
commit
5ac3ccf59b
13 changed files with 2298 additions and 25 deletions
2
.vscode/settings.json
vendored
Normal file
2
.vscode/settings.json
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
{
|
||||||
|
}
|
11
Project.toml
Normal file
11
Project.toml
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
[deps]
|
||||||
|
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
|
||||||
|
Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
|
||||||
|
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
|
||||||
|
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
||||||
|
DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964"
|
||||||
|
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
|
||||||
|
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
|
||||||
|
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
|
||||||
|
MbedTLS = "739be429-bea8-5141-9913-cc70e7f3736d"
|
||||||
|
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
|
|
@ -2,12 +2,12 @@
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||||||
|
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
<meta name="generator" content="quarto-1.4.553">
|
<meta name="generator" content="quarto-1.5.56">
|
||||||
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||||
|
|
||||||
|
|
||||||
<title>Kyle Belanger - Posts</title>
|
<title>Posts – Kyle Belanger</title>
|
||||||
<style>
|
<style>
|
||||||
code{white-space: pre-wrap;}
|
code{white-space: pre-wrap;}
|
||||||
span.smallcaps{font-variant: small-caps;}
|
span.smallcaps{font-variant: small-caps;}
|
||||||
|
@ -98,7 +98,7 @@ ul.task-list li input[type="checkbox"] {
|
||||||
})
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
|
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
|
||||||
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
|
<script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
|
||||||
|
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
|
@ -143,7 +143,7 @@ window.Quarto = {
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
<div id="quarto-search" class="" title="Search"></div>
|
<div id="quarto-search" class="" title="Search"></div>
|
||||||
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarCollapse" aria-controls="navbarCollapse" aria-expanded="false" aria-label="Toggle navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarCollapse" aria-controls="navbarCollapse" role="menu" aria-expanded="false" aria-label="Toggle navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||||||
<span class="navbar-toggler-icon"></span>
|
<span class="navbar-toggler-icon"></span>
|
||||||
</button>
|
</button>
|
||||||
<div class="collapse navbar-collapse" id="navbarCollapse">
|
<div class="collapse navbar-collapse" id="navbarCollapse">
|
||||||
|
@ -163,7 +163,7 @@ window.Quarto = {
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
</div> <!-- /navcollapse -->
|
</div> <!-- /navcollapse -->
|
||||||
<div class="quarto-navbar-tools">
|
<div class="quarto-navbar-tools">
|
||||||
</div>
|
</div>
|
||||||
</div> <!-- /container-fluid -->
|
</div> <!-- /container-fluid -->
|
||||||
</nav>
|
</nav>
|
||||||
|
@ -573,18 +573,7 @@ window.document.addEventListener("DOMContentLoaded", function (event) {
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const clipboard = new window.ClipboardJS('.code-copy-button', {
|
const onCopySuccess = function(e) {
|
||||||
text: function(trigger) {
|
|
||||||
const codeEl = trigger.previousElementSibling.cloneNode(true);
|
|
||||||
for (const childEl of codeEl.children) {
|
|
||||||
if (isCodeAnnotation(childEl)) {
|
|
||||||
childEl.remove();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return codeEl.innerText;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
clipboard.on('success', function(e) {
|
|
||||||
// button target
|
// button target
|
||||||
const button = e.trigger;
|
const button = e.trigger;
|
||||||
// don't keep focus
|
// don't keep focus
|
||||||
|
@ -616,7 +605,29 @@ window.document.addEventListener("DOMContentLoaded", function (event) {
|
||||||
}, 1000);
|
}, 1000);
|
||||||
// clear code selection
|
// clear code selection
|
||||||
e.clearSelection();
|
e.clearSelection();
|
||||||
|
}
|
||||||
|
const getTextToCopy = function(trigger) {
|
||||||
|
const codeEl = trigger.previousElementSibling.cloneNode(true);
|
||||||
|
for (const childEl of codeEl.children) {
|
||||||
|
if (isCodeAnnotation(childEl)) {
|
||||||
|
childEl.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return codeEl.innerText;
|
||||||
|
}
|
||||||
|
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
|
||||||
|
text: getTextToCopy
|
||||||
});
|
});
|
||||||
|
clipboard.on('success', onCopySuccess);
|
||||||
|
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
|
||||||
|
// For code content inside modals, clipBoardJS needs to be initialized with a container option
|
||||||
|
// TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
|
||||||
|
const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
|
||||||
|
text: getTextToCopy,
|
||||||
|
container: window.document.getElementById('quarto-embedded-source-code-modal')
|
||||||
|
});
|
||||||
|
clipboardModal.on('success', onCopySuccess);
|
||||||
|
}
|
||||||
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
|
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
|
||||||
var mailtoRegex = new RegExp(/^mailto:/);
|
var mailtoRegex = new RegExp(/^mailto:/);
|
||||||
var filterRegex = new RegExp('/' + window.location.host + '/');
|
var filterRegex = new RegExp('/' + window.location.host + '/');
|
||||||
|
@ -624,7 +635,7 @@ window.document.addEventListener("DOMContentLoaded", function (event) {
|
||||||
return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
|
return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
|
||||||
}
|
}
|
||||||
// Inspect non-navigation links and adorn them if external
|
// Inspect non-navigation links and adorn them if external
|
||||||
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool)');
|
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
|
||||||
for (var i=0; i<links.length; i++) {
|
for (var i=0; i<links.length; i++) {
|
||||||
const link = links[i];
|
const link = links[i];
|
||||||
if (!isInternal(link.href)) {
|
if (!isInternal(link.href)) {
|
||||||
|
|
953
_site/posts/2024-08-09-learning-Julia/index.html
Normal file
953
_site/posts/2024-08-09-learning-Julia/index.html
Normal file
|
@ -0,0 +1,953 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||||||
|
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="generator" content="quarto-1.5.56">
|
||||||
|
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||||
|
|
||||||
|
<meta name="author" content="Kyle Belanger">
|
||||||
|
<meta name="dcterms.date" content="2024-08-09">
|
||||||
|
|
||||||
|
<title>Learning Julia by WebScraping Amtrak Data – Kyle Belanger</title>
|
||||||
|
<style>
|
||||||
|
code{white-space: pre-wrap;}
|
||||||
|
span.smallcaps{font-variant: small-caps;}
|
||||||
|
div.columns{display: flex; gap: min(4vw, 1.5em);}
|
||||||
|
div.column{flex: auto; overflow-x: auto;}
|
||||||
|
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||||||
|
ul.task-list{list-style: none;}
|
||||||
|
ul.task-list li input[type="checkbox"] {
|
||||||
|
width: 0.8em;
|
||||||
|
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
|
||||||
|
vertical-align: middle;
|
||||||
|
}
|
||||||
|
/* CSS for syntax highlighting */
|
||||||
|
pre > code.sourceCode { white-space: pre; position: relative; }
|
||||||
|
pre > code.sourceCode > span { line-height: 1.25; }
|
||||||
|
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||||||
|
.sourceCode { overflow: visible; }
|
||||||
|
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||||||
|
div.sourceCode { margin: 1em 0; }
|
||||||
|
pre.sourceCode { margin: 0; }
|
||||||
|
@media screen {
|
||||||
|
div.sourceCode { overflow: auto; }
|
||||||
|
}
|
||||||
|
@media print {
|
||||||
|
pre > code.sourceCode { white-space: pre-wrap; }
|
||||||
|
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
|
||||||
|
}
|
||||||
|
pre.numberSource code
|
||||||
|
{ counter-reset: source-line 0; }
|
||||||
|
pre.numberSource code > span
|
||||||
|
{ position: relative; left: -4em; counter-increment: source-line; }
|
||||||
|
pre.numberSource code > span > a:first-child::before
|
||||||
|
{ content: counter(source-line);
|
||||||
|
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||||||
|
border: none; display: inline-block;
|
||||||
|
-webkit-touch-callout: none; -webkit-user-select: none;
|
||||||
|
-khtml-user-select: none; -moz-user-select: none;
|
||||||
|
-ms-user-select: none; user-select: none;
|
||||||
|
padding: 0 4px; width: 4em;
|
||||||
|
}
|
||||||
|
pre.numberSource { margin-left: 3em; padding-left: 4px; }
|
||||||
|
div.sourceCode
|
||||||
|
{ }
|
||||||
|
@media screen {
|
||||||
|
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js" integrity="sha512-bLT0Qm9VnAYZDflyKcBaQ2gg0hSYNQrJ8RilYldYQ1FxQYoCLtUjuuRuZo+fjqhx/qtq/1itJ0C2ejDxltZVFg==" crossorigin="anonymous"></script><script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-nav/headroom.min.js"></script>
|
||||||
|
<script src="../../site_libs/clipboard/clipboard.min.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-search/fuse.min.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-search/quarto-search.js"></script>
|
||||||
|
<meta name="quarto:offset" content="../../">
|
||||||
|
<script src="../../site_libs/quarto-html/quarto.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-html/popper.min.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||||||
|
<script src="../../site_libs/quarto-html/anchor.min.js"></script>
|
||||||
|
<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||||||
|
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||||||
|
<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
|
||||||
|
<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||||||
|
<link href="../../site_libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
|
||||||
|
<script id="quarto-search-options" type="application/json">{
|
||||||
|
"location": "navbar",
|
||||||
|
"copy-button": false,
|
||||||
|
"collapse-after": 3,
|
||||||
|
"panel-placement": "end",
|
||||||
|
"type": "overlay",
|
||||||
|
"limit": 50,
|
||||||
|
"keyboard-shortcut": [
|
||||||
|
"f",
|
||||||
|
"/",
|
||||||
|
"s"
|
||||||
|
],
|
||||||
|
"show-item-context": false,
|
||||||
|
"language": {
|
||||||
|
"search-no-results-text": "No results",
|
||||||
|
"search-matching-documents-text": "matching documents",
|
||||||
|
"search-copy-link-title": "Copy link to search",
|
||||||
|
"search-hide-matches-text": "Hide additional matches",
|
||||||
|
"search-more-match-text": "more match in this document",
|
||||||
|
"search-more-matches-text": "more matches in this document",
|
||||||
|
"search-clear-button-title": "Clear",
|
||||||
|
"search-text-placeholder": "",
|
||||||
|
"search-detached-cancel-button-title": "Cancel",
|
||||||
|
"search-submit-button-title": "Submit",
|
||||||
|
"search-label": "Search"
|
||||||
|
}
|
||||||
|
}</script>
|
||||||
|
<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" integrity="sha512-c3Nl8+7g4LMSTdrm621y7kf9v3SDPnhxLNhcjFJbKECVnmZHTdo+IRO05sNLTH/D3vA6u1X32ehoLC7WFVdheg==" crossorigin="anonymous"></script>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<script type="application/javascript">define('jquery', [],function() {return window.jQuery;})</script>
|
||||||
|
|
||||||
|
<meta name="quarto:status" content="draft">
|
||||||
|
|
||||||
|
|
||||||
|
<link rel="stylesheet" href="../../styles.css">
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body class="floating nav-fixed">
|
||||||
|
|
||||||
|
<div id="quarto-search-results"></div>
|
||||||
|
<header id="quarto-header" class="headroom fixed-top quarto-banner"><div id="quarto-draft-alert" class="alert alert-warning"><i class="bi bi-pencil-square"></i>Draft</div>
|
||||||
|
<nav class="navbar navbar-expand-lg " data-bs-theme="dark">
|
||||||
|
<div class="navbar-container container-fluid">
|
||||||
|
<div class="navbar-brand-container mx-auto">
|
||||||
|
<a class="navbar-brand" href="../../index.html">
|
||||||
|
<span class="navbar-title">Kyle Belanger</span>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<div id="quarto-search" class="" title="Search"></div>
|
||||||
|
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarCollapse" aria-controls="navbarCollapse" role="menu" aria-expanded="false" aria-label="Toggle navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||||||
|
<span class="navbar-toggler-icon"></span>
|
||||||
|
</button>
|
||||||
|
<div class="collapse navbar-collapse" id="navbarCollapse">
|
||||||
|
<ul class="navbar-nav navbar-nav-scroll ms-auto">
|
||||||
|
<li class="nav-item">
|
||||||
|
<a class="nav-link" href="../../blog.html">
|
||||||
|
<span class="menu-text">Posts</span></a>
|
||||||
|
</li>
|
||||||
|
<li class="nav-item">
|
||||||
|
<a class="nav-link" href="../../kyle_resume.pdf">
|
||||||
|
<span class="menu-text">Resume</span></a>
|
||||||
|
</li>
|
||||||
|
<li class="nav-item compact">
|
||||||
|
<a class="nav-link" href="https://github.com/mmmmtoasty19"> <i class="bi bi-github" role="img">
|
||||||
|
</i>
|
||||||
|
<span class="menu-text"></span></a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</div> <!-- /navcollapse -->
|
||||||
|
<div class="quarto-navbar-tools">
|
||||||
|
</div>
|
||||||
|
</div> <!-- /container-fluid -->
|
||||||
|
</nav>
|
||||||
|
</header>
|
||||||
|
<!-- content -->
|
||||||
|
<header id="title-block-header" class="quarto-title-block default toc-left page-columns page-full">
|
||||||
|
<div class="quarto-title-banner page-columns page-full">
|
||||||
|
<div class="quarto-title column-body">
|
||||||
|
<h1 class="title">Learning Julia by WebScraping Amtrak Data</h1>
|
||||||
|
<div class="quarto-categories">
|
||||||
|
<div class="quarto-category">Julia</div>
|
||||||
|
<div class="quarto-category">dataViz</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
<div class="quarto-title-meta">
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<div class="quarto-title-meta-heading">Author</div>
|
||||||
|
<div class="quarto-title-meta-contents">
|
||||||
|
<p><a href="https://kyleb.rbind.io/">Kyle Belanger</a> </p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<div class="quarto-title-meta-heading">Published</div>
|
||||||
|
<div class="quarto-title-meta-contents">
|
||||||
|
<p class="date">August 9, 2024</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
</header><div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
|
||||||
|
<!-- sidebar -->
|
||||||
|
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation floating overflow-auto">
|
||||||
|
<nav id="TOC" role="doc-toc" class="toc-active">
|
||||||
|
<h2 id="toc-title">Table of contents</h2>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li><a href="#load-packages" id="toc-load-packages" class="nav-link active" data-scroll-target="#load-packages">Load Packages</a></li>
|
||||||
|
<li><a href="#setting-up-the-web-scraping" id="toc-setting-up-the-web-scraping" class="nav-link" data-scroll-target="#setting-up-the-web-scraping">Setting up the Web Scraping</a></li>
|
||||||
|
<li><a href="#creating-the-dataframe" id="toc-creating-the-dataframe" class="nav-link" data-scroll-target="#creating-the-dataframe">Creating the DataFrame</a></li>
|
||||||
|
<li><a href="#cleaning-the-dataframe" id="toc-cleaning-the-dataframe" class="nav-link" data-scroll-target="#cleaning-the-dataframe">Cleaning the DataFrame</a></li>
|
||||||
|
<li><a href="#grouping-and-summarizing" id="toc-grouping-and-summarizing" class="nav-link" data-scroll-target="#grouping-and-summarizing">Grouping and Summarizing</a></li>
|
||||||
|
<li><a href="#plotting" id="toc-plotting" class="nav-link" data-scroll-target="#plotting">Plotting</a></li>
|
||||||
|
<li><a href="#conclusion" id="toc-conclusion" class="nav-link" data-scroll-target="#conclusion">Conclusion</a></li>
|
||||||
|
</ul>
|
||||||
|
</nav>
|
||||||
|
</nav>
|
||||||
|
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
|
||||||
|
<!-- margin-sidebar -->
|
||||||
|
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar zindex-bottom">
|
||||||
|
</div>
|
||||||
|
<!-- main -->
|
||||||
|
<main class="content quarto-banner-title-block" id="quarto-document-content">
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<p>Recently two things happened quite close together that started me on the journey to this post.</p>
|
||||||
|
<ol type="1">
|
||||||
|
<li>First I have been planning on fiddling around and learning Julia for a while. I love R and that love will not change but I thought it was good to try something different.</li>
|
||||||
|
<li>My mom took a train and it was super late! I started looking at the station and it seemed like it was always late.</li>
|
||||||
|
</ol>
|
||||||
|
<p>So these two things led me to this: pulling Amtrak data from the web using Julia. I do not claim to be an expert on Julia, but I am learning and I wanted to share my journey, nor do I claim to be an expert at Web Scraping. Taking those things into account, let's follow along.</p>
|
||||||
|
<section id="load-packages" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="load-packages">Load Packages</h2>
|
||||||
|
<p>First off I will load the Julia packages I am going to use. The first three all have to do with web scraping, and getting the data off the website. CairoMakie will be used to make the plot. All of the rest are for data wrangling. I already have all of these packages in this project environment so I just need to let the Julia REPL know to load them. If you are brand new to Julia this <a href="https://towardsdatascience.com/how-to-setup-project-environments-in-julia-ec8ae73afe9c">site</a> really helped explain the idea of project environments to me. I also use <a href="https://code.visualstudio.com/">VSCode</a> along with the <a href="https://marketplace.visualstudio.com/items?itemName=julialang.language-julia">Julia extension</a> which does a great job of handling the project environment.</p>
|
||||||
|
<div id="2" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">HTTP</span></span>
|
||||||
|
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">Gumbo</span></span>
|
||||||
|
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">Cascadia</span></span>
|
||||||
|
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">DataFrames</span></span>
|
||||||
|
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">DataFramesMeta</span></span>
|
||||||
|
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">Dates</span></span>
|
||||||
|
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">Statistics</span></span>
|
||||||
|
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">CategoricalArrays</span></span>
|
||||||
|
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="im">using</span> <span class="bu">CairoMakie</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section id="setting-up-the-web-scraping" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="setting-up-the-web-scraping">Setting up the Web Scraping</h2>
|
||||||
|
<p>Now that the packages are loaded, we can start setting up the web scraping. From my internet searching I found that Amtrak does have an API but it is quite challenging to use. I found this website <a href="https://juckins.net/amtrak_status/archive/html/home.php">Amtrak Status</a> which does a great job of showing the data I was looking for. In this example I am just going to pull data for two trains, train 97 and train 98. You can see in the link I set those as the train numbers, and if you follow the link you will see it sets it up in a nice table to view the historical data. We then use the HTTP package to get the raw website data and then use Gumbo to parse the HTML into a table. The Cascadia package gives the various CSS selectors to help pull the info I want out of the entire page. The page table does not have an id but it is also the only table on the page. I was able to use the CSS Selector “tr” to get each row of the table into a vector. If we examine the third item in the rows vector we see that it has the information we want (the first two rows are headers for the table).</p>
|
||||||
|
<!-- cspell: disable -->
|
||||||
|
<div id="4" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>url <span class="op">=</span> <span class="st">"https://juckins.net/amtrak_status/archive/html/history.php?train_num=97%2C98&station=&date_start=07%2F01%2F2024&date_end=07%2F31%2F2024"</span>;</span>
|
||||||
|
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>resp <span class="op">=</span> HTTP.<span class="fu">get</span>(url);</span>
|
||||||
|
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>page <span class="op">=</span> <span class="fu">parsehtml</span>(<span class="fu">String</span>(resp.body));</span>
|
||||||
|
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a>rows <span class="op">=</span> <span class="fu">eachmatch</span>(sel<span class="st">"tr"</span>,page.root);</span>
|
||||||
|
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>rows[<span class="fl">3</span>]</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
</div>
|
||||||
|
<!-- cspell: enable -->
|
||||||
|
</section>
|
||||||
|
<section id="creating-the-dataframe" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="creating-the-dataframe">Creating the DataFrame</h2>
|
||||||
|
<p>Now that each row of the table is stored in a vector we need to rebuild the table into a dataframe in Julia. First I am initializing an empty dataframe by creating each column that will hold data. The column names match those of the header in the table on the website. Then I loop through each item in the rows vector. The text variable is a vector of all the td elements in the row. If the text vector is not empty and has more than one item in it, then we loop through the items and push the text into the row_data vector. Finally we push the row_data vector into the dataframe created prior to the loop. By having the nested if I can remove the footer column at the end of the table from the website. The website table header uses a different CSS selector than the rest of the table but the footer does not. At the end of the loop I now have the same table that is on the website but stored as a dataframe in Julia.</p>
|
||||||
|
<div id="6" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># create empty DataFrame and then populate it with the table from website</span></span>
|
||||||
|
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a>df <span class="op">=</span> <span class="fu">DataFrame</span>(train <span class="op">=</span> <span class="dt">String</span>[], origin_date <span class="op">=</span> [], station <span class="op">=</span> <span class="dt">String</span>[], sch_dp <span class="op">=</span> [], act_dp <span class="op">=</span> <span class="dt">String</span>[], comments <span class="op">=</span> [], s_disrupt <span class="op">=</span> [], cancellations <span class="op">=</span> [])</span>
|
||||||
|
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> rows</span>
|
||||||
|
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> text <span class="op">=</span> <span class="fu">eachmatch</span>(<span class="fu">Selector</span>(<span class="st">"td"</span>), i)</span>
|
||||||
|
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> row_data <span class="op">=</span> []</span>
|
||||||
|
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> !<span class="fu">isempty</span>(text) <span class="op">&&</span> <span class="fu">length</span>(text) <span class="op">></span> <span class="fl">1</span></span>
|
||||||
|
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> item <span class="kw">in</span> text</span>
|
||||||
|
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a> <span class="fu">push!</span>(row_data, <span class="fu">nodeText</span>(item))</span>
|
||||||
|
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">end</span></span>
|
||||||
|
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">push!</span>(df, row_data)</span>
|
||||||
|
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">end</span></span>
|
||||||
|
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a><span class="cf">end</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section id="cleaning-the-dataframe" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="cleaning-the-dataframe">Cleaning the DataFrame</h2>
|
||||||
|
<p>Coming from R I am quite familiar with data cleaning using dplyr and the rest of the tidyverse packages. When looking at options I really liked what the DataFramesMeta package brings, so I have used that here to get the data where I want it. I first filter out any trains that have a service disruption as well as any that are blank in the departure column. Next I select only the station, train, and the comments column. I originally tried using the two departure columns but was having an issue with trains that arrived at the station one day but then left the next. These were causing the delay to be quite large as it was calculating as if it actually left before arriving. The comments column has what I needed; I just had to pull the string out and convert it to a numeric. After selecting the columns I first create the delay column. This pulls the comment string out of the comment column only if it contains Dp: as this indicates how late or early the train left. Next I am pulling out the time in minutes and hours from the delay string and converting those numbers to integers. The total delay column adds the minutes and hours together and if the word late is not in the column it will convert the number to negative. A negative delay in this case means the train left early. Finally I transform the columns to categorical so that they are easier to work with in the future. You can notice that for the last transformation I could not figure out how to select two columns using the transform macro. Also for those coming from R note the .=>; this is the broadcast operator and it lets Julia know to perform the action on the entire vector (I think I am explaining this right!). I end the block by showing the first 5 rows of the modified dataframe.</p>
|
||||||
|
<div id="8" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>mod_df <span class="op">=</span> <span class="pp">@chain</span> df <span class="cf">begin</span></span>
|
||||||
|
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rsubset</span> <span class="op">:</span>act_dp <span class="op">!=</span> <span class="st">""</span> <span class="op">&&</span> <span class="op">:</span>s_disrupt <span class="op">!=</span> <span class="st">"SD"</span></span>
|
||||||
|
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> <span class="pp">@select</span> <span class="op">:</span>train <span class="op">:</span>station <span class="op">:</span>comments</span>
|
||||||
|
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> <span class="co">#can't perform match if there is nothing there</span></span>
|
||||||
|
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rtransform</span> <span class="op">:</span>delay <span class="op">=</span> <span class="fu">occursin</span>(<span class="st">r"Dp:"</span>, <span class="op">:</span>comments) ? <span class="fu">match</span>(<span class="st">r"Dp:</span><span class="sc">.*</span><span class="st">"</span>, <span class="op">:</span>comments).match <span class="op">:</span> <span class="st">""</span></span>
|
||||||
|
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rtransform</span> <span class="op">:</span>min <span class="op">=</span> <span class="fu">occursin</span>(<span class="st">r"min"</span>, <span class="op">:</span>delay) ? <span class="fu">parse</span>(<span class="dt">Int</span>,<span class="fu">match</span>(<span class="st">r"</span><span class="ch">([</span><span class="st">0-9</span><span class="ch">]</span><span class="sc">*</span><span class="ch">)</span><span class="st"> min"</span>, <span class="op">:</span>delay)[<span class="fl">1</span>]) <span class="op">:</span> <span class="fu">Int</span>(<span class="fl">0</span>)</span>
|
||||||
|
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rtransform</span> <span class="op">:</span>hour <span class="op">=</span> <span class="fu">occursin</span>(<span class="st">r"hr"</span>, <span class="op">:</span>delay) ? <span class="fu">parse</span>(<span class="dt">Int</span>,<span class="fu">match</span>(<span class="st">r"</span><span class="ch">([</span><span class="st">0-9</span><span class="ch">]</span><span class="sc">*</span><span class="ch">)</span><span class="st"> hr"</span>, <span class="op">:</span>delay)[<span class="fl">1</span>]) <span class="op">*</span><span class="fl">60</span> <span class="op">:</span> <span class="fu">Int</span>(<span class="fl">0</span>)</span>
|
||||||
|
<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rtransform</span> <span class="op">:</span>total_delay_mins <span class="op">=</span> <span class="op">:</span>min <span class="op">+</span> <span class="op">:</span>hour <span class="op">|></span> x <span class="op">-></span> <span class="fu">occursin</span>(<span class="st">r"late"</span>, <span class="op">:</span>delay) ? x <span class="op">:</span> x <span class="op">*-</span><span class="fl">1</span> <span class="co">#if word late does not appear, train left early</span></span>
|
||||||
|
<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">transform</span>([<span class="op">:</span>station, <span class="op">:</span>train] <span class="op">.=></span> categorical, renamecols <span class="op">=</span> <span class="cn">false</span>)</span>
|
||||||
|
<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="cf">end</span></span>
|
||||||
|
<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a><span class="fu">first</span>(mod_df, <span class="fl">5</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
<div class="cell-output cell-output-display" data-execution_count="1">
|
||||||
|
<div><div style="float: left;"><span>5×7 DataFrame</span></div><div style="clear: both;"></div></div><div class="data-frame" style="overflow-x: scroll;">
|
||||||
|
<table class="data-frame caption-top table table-sm table-striped small" data-quarto-postprocess="true">
|
||||||
|
<thead>
|
||||||
|
<tr class="header">
|
||||||
|
<th class="rowNumber" data-quarto-table-cell-role="th" style="text-align: right; font-weight: bold;">Row</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">train</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">station</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">comments</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">delay</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">min</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">hour</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">total_delay_mins</th>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd subheader headerLastRow">
|
||||||
|
<th class="rowNumber" data-quarto-table-cell-role="th" style="text-align: right; font-weight: bold;"></th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="CategoricalArrays.CategoricalValue{String, UInt32}">Cat…</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="CategoricalArrays.CategoricalValue{String, UInt32}">Cat…</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Any">Any</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="AbstractString">Abstract…</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">1</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">RMT</td>
|
||||||
|
<td style="text-align: left;">Dp: 1 min late.</td>
|
||||||
|
<td style="text-align: left;">Dp: 1 min late.</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="even">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">2</td>
|
||||||
|
<td style="text-align: left;">98</td>
|
||||||
|
<td style="text-align: left;">FLO</td>
|
||||||
|
<td style="text-align: left;">Ar: 7 min early. | Dp: On time.</td>
|
||||||
|
<td style="text-align: left;">Dp: On time.</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">3</td>
|
||||||
|
<td style="text-align: left;">98</td>
|
||||||
|
<td style="text-align: left;">KTR</td>
|
||||||
|
<td style="text-align: left;">Dp: 12 min late.</td>
|
||||||
|
<td style="text-align: left;">Dp: 12 min late.</td>
|
||||||
|
<td style="text-align: right;">12</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">12</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="even">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">4</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">PTB</td>
|
||||||
|
<td style="text-align: left;">Dp: 6 min late.</td>
|
||||||
|
<td style="text-align: left;">Dp: 6 min late.</td>
|
||||||
|
<td style="text-align: right;">6</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">6</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">5</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">RVR</td>
|
||||||
|
<td style="text-align: left;">Ar: 8 min late. | Dp: 5 min late.</td>
|
||||||
|
<td style="text-align: left;">Dp: 5 min late.</td>
|
||||||
|
<td style="text-align: right;">5</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">5</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section id="grouping-and-summarizing" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="grouping-and-summarizing">Grouping and Summarizing</h2>
|
||||||
|
<p>Now that I have the data I want, I want to group and summarize it to create some graphs. Again using DataFramesMeta and the by keyword, I can group by the train and station columns and then create the mean, median, max, and min columns. This action felt very similar to summarize in dplyr. DataFramesMeta does allow you to do the grouping and combining as two separate steps, but the by keyword combines them into one step. I then ordered by the station column and then by the train column. Next I created a column that shows the difference in the mean delay between the two trains at each station. I didn’t end up using this for now but I might make something with it later. Last, I created two columns that contain the level code for the station and train columns. I will talk about the reason for this in the next section. The function levelcode is from the CategoricalArrays package and it creates an integer column that matches the level of the categorical name. Finally, I display the first 5 rows of the dataframe.</p>
|
||||||
|
<div id="10" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>gd <span class="op">=</span> <span class="pp">@chain</span> mod_df <span class="cf">begin</span></span>
|
||||||
|
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a> <span class="pp">@by</span> _ [<span class="op">:</span>train,<span class="op">:</span>station] <span class="cf">begin</span></span>
|
||||||
|
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>mean <span class="op">=</span> <span class="dt">Float32</span>[<span class="bu">Statistics</span>.<span class="fu">mean</span>(<span class="op">:</span>total_delay_mins)]</span>
|
||||||
|
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>median <span class="op">=</span> <span class="bu">Statistics</span>.<span class="fu">median</span>(<span class="op">:</span>total_delay_mins)</span>
|
||||||
|
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>max <span class="op">=</span> <span class="fu">maximum</span>(<span class="op">:</span>total_delay_mins)</span>
|
||||||
|
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>min <span class="op">=</span> <span class="fu">minimum</span>(<span class="op">:</span>total_delay_mins) </span>
|
||||||
|
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">end</span> </span>
|
||||||
|
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a> <span class="pp">@orderby</span> <span class="op">:</span>station <span class="op">:</span>train</span>
|
||||||
|
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a> <span class="pp">@groupby</span> <span class="op">:</span>station</span>
|
||||||
|
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a> <span class="pp">@transform</span> <span class="op">:</span>diff <span class="op">=</span> [<span class="cn">missing</span>; <span class="fu">diff</span>(<span class="op">:</span>mean)]</span>
|
||||||
|
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a> <span class="pp">@rtransform</span> _ <span class="cf">begin</span></span>
|
||||||
|
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>station_code <span class="op">=</span> <span class="fu">levelcode</span>(<span class="op">:</span>station)</span>
|
||||||
|
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a> <span class="op">:</span>train_code <span class="op">=</span> <span class="fu">levelcode</span>(<span class="op">:</span>train)</span>
|
||||||
|
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a> <span class="cf">end</span></span>
|
||||||
|
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="cf">end</span></span>
|
||||||
|
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a><span class="fu">first</span>(gd, <span class="fl">5</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
<div class="cell-output cell-output-display" data-execution_count="1">
|
||||||
|
<div><div style="float: left;"><span>5×9 DataFrame</span></div><div style="clear: both;"></div></div><div class="data-frame" style="overflow-x: scroll;">
|
||||||
|
<table class="data-frame caption-top table table-sm table-striped small" data-quarto-postprocess="true">
|
||||||
|
<thead>
|
||||||
|
<tr class="header">
|
||||||
|
<th class="rowNumber" data-quarto-table-cell-role="th" style="text-align: right; font-weight: bold;">Row</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">train</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">station</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">mean</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">median</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">max</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">min</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">diff</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">station_code</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th">train_code</th>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd subheader headerLastRow">
|
||||||
|
<th class="rowNumber" data-quarto-table-cell-role="th" style="text-align: right; font-weight: bold;"></th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="CategoricalArrays.CategoricalValue{String, UInt32}">Cat…</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="CategoricalArrays.CategoricalValue{String, UInt32}">Cat…</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Float32">Float32</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Float64">Float64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Union{Missing, Float32}">Float32?</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
<th style="text-align: left;" data-quarto-table-cell-role="th" title="Int64">Int64</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">1</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">ALX</td>
|
||||||
|
<td style="text-align: right;">70.4</td>
|
||||||
|
<td style="text-align: right;">50.0</td>
|
||||||
|
<td style="text-align: right;">287</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right; font-style: italic;">missing</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="even">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">2</td>
|
||||||
|
<td style="text-align: left;">98</td>
|
||||||
|
<td style="text-align: left;">ALX</td>
|
||||||
|
<td style="text-align: right;">101.387</td>
|
||||||
|
<td style="text-align: right;">77.0</td>
|
||||||
|
<td style="text-align: right;">399</td>
|
||||||
|
<td style="text-align: right;">-16</td>
|
||||||
|
<td style="text-align: right;">30.9871</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
<td style="text-align: right;">2</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">3</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">BAL</td>
|
||||||
|
<td style="text-align: right;">53.3333</td>
|
||||||
|
<td style="text-align: right;">27.0</td>
|
||||||
|
<td style="text-align: right;">267</td>
|
||||||
|
<td style="text-align: right;">3</td>
|
||||||
|
<td style="text-align: right; font-style: italic;">missing</td>
|
||||||
|
<td style="text-align: right;">2</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="even">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">4</td>
|
||||||
|
<td style="text-align: left;">98</td>
|
||||||
|
<td style="text-align: left;">BAL</td>
|
||||||
|
<td style="text-align: right;">120.226</td>
|
||||||
|
<td style="text-align: right;">104.0</td>
|
||||||
|
<td style="text-align: right;">414</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right;">66.8925</td>
|
||||||
|
<td style="text-align: right;">2</td>
|
||||||
|
<td style="text-align: right;">2</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="odd">
|
||||||
|
<td class="rowNumber" style="text-align: right; font-weight: bold;">5</td>
|
||||||
|
<td style="text-align: left;">97</td>
|
||||||
|
<td style="text-align: left;">CHS</td>
|
||||||
|
<td style="text-align: right;">71.1</td>
|
||||||
|
<td style="text-align: right;">53.0</td>
|
||||||
|
<td style="text-align: right;">286</td>
|
||||||
|
<td style="text-align: right;">0</td>
|
||||||
|
<td style="text-align: right; font-style: italic;">missing</td>
|
||||||
|
<td style="text-align: right;">3</td>
|
||||||
|
<td style="text-align: right;">1</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section id="plotting" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="plotting">Plotting</h2>
|
||||||
|
<p>Coming from R and the ggplot package (also having played around a bit in Plotly for R) there was a rather steep learning curve to Makie! I do feel there is a ton of flexibility in Makie, but learning to use it is a beast, and was probably the hardest part of this whole thing. The first challenge was that Makie does not like categorical variables (at least for barplots, don’t know if this is always true), thus the need for using the level codes so I could pass a numerical vector to the x axis. I am then able to label that axis with the categorical labels. Makie does also allow you to just call the barplot function without all the other setup, and it will automatically create the figure and axis; however, I wanted to do it manually and really build up the graph. The first step was setting a color gradient. I used Dark2 from the ColorBrewer schemes, just as a personal preference for one I really like. Next up I create the figure. Directly from the Makie docs: “The Figure is the outermost container object.” I could pass some arguments to the Figure constructor and change the size or colors, but for this one I just left everything as the defaults. Next up is creating the axis. I placed it at position 1,1 within the previously created figure. I also pass labels for the x and y axis, a title, and then the labels for the xticks. The label rotation is in radians, so pi/2 rotates the labels 90 degrees. Next I generate the barplot. Note the ! in the function call, which allows for plotting on an existing axis. (<a href="https://docs.julialang.org/en/v1/manual/style-guide/#bang-convention">More info on the Bang Operator</a>) Last, I set up the labels and colors for the Legend, and then place the Legend at position 1,2 of the existing figure.</p>
|
||||||
|
<div id="12" class="cell" data-execution_count="1">
|
||||||
|
<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode julia code-with-copy"><code class="sourceCode julia"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>colors <span class="op">=</span> <span class="fu">cgrad</span>(<span class="op">:</span>Dark2_6)</span>
|
||||||
|
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>f <span class="op">=</span> <span class="fu">Figure</span>();</span>
|
||||||
|
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a>ax <span class="op">=</span> <span class="fu">Axis</span>(f[<span class="fl">1</span>,<span class="fl">1</span>], xlabel <span class="op">=</span> <span class="st">"Station"</span>, ylabel <span class="op">=</span> <span class="st">"Mean Delay (mins)"</span>, title <span class="op">=</span> <span class="st">"Mean Delay by Station"</span>, xticks <span class="op">=</span> (<span class="fl">1</span><span class="op">:</span><span class="fu">length</span>(<span class="fu">levels</span>(gd.station_code)), <span class="fu">levels</span>(gd.station)), xticklabelrotation <span class="op">=</span> <span class="cn">pi</span><span class="op">/</span><span class="fl">2</span>)</span>
|
||||||
|
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="fu">barplot!</span>(ax, gd.station_code, gd.mean, dodge <span class="op">=</span> gd.train_code, color <span class="op">=</span> colors[gd.train_code]) </span>
|
||||||
|
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>labels <span class="op">=</span> [<span class="st">"</span><span class="sc">$</span>i<span class="st">"</span> for i <span class="kw">in</span> <span class="fu">unique</span>(gd.train)]</span>
|
||||||
|
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a>elements <span class="op">=</span> [<span class="fu">PolyElement</span>(polycolor <span class="op">=</span> colors[i]) for i <span class="kw">in</span> <span class="fu">unique</span>(gd.train_code)]</span>
|
||||||
|
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="fu">Legend</span>(f[<span class="fl">1</span>,<span class="fl">2</span>],elements, labels, <span class="st">"Train Number"</span>)</span>
|
||||||
|
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a></span>
|
||||||
|
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a>f</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||||
|
<div class="cell-output cell-output-display" data-execution_count="1">
|
||||||
|
<div>
|
||||||
|
<figure class="figure">
|
||||||
|
<p><img src="index_files/figure-html/cell-7-output-1.svg" class="img-fluid figure-img"></p>
|
||||||
|
</figure>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section id="conclusion" class="level2">
|
||||||
|
<h2 class="anchored" data-anchor-id="conclusion">Conclusion</h2>
|
||||||
|
<p>There is still a lot that could be done with this data set, and I am interested to keep playing around with it to see what kind of insights I could gather. Overall I learned a lot about Julia, but as I learned with R, there is always more to learn! I look forward to seeing where this journey takes me.</p>
|
||||||
|
|
||||||
|
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<div id="quarto-appendix" class="default"><section class="quarto-appendix-contents" id="quarto-reuse"><h2 class="anchored quarto-appendix-heading">Reuse</h2><div class="quarto-appendix-contents"><div><a rel="license" href="https://creativecommons.org/licenses/by/4.0/">CC BY 4.0</a></div></div></section><section class="quarto-appendix-contents" id="quarto-citation"><h2 class="anchored quarto-appendix-heading">Citation</h2><div><div class="quarto-appendix-secondary-label">BibTeX citation:</div><pre class="sourceCode code-with-copy quarto-appendix-bibtex"><code class="sourceCode bibtex">@online{belanger2024,
|
||||||
|
author = {Belanger, Kyle},
|
||||||
|
title = {Learning {Julia} by {WebScraping} {Amtrak} {Data}},
|
||||||
|
date = {2024-08-09},
|
||||||
|
langid = {en}
|
||||||
|
}
|
||||||
|
</code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre><div class="quarto-appendix-secondary-label">For attribution, please cite this work as:</div><div id="ref-belanger2024" class="csl-entry quarto-appendix-citeas" role="listitem">
|
||||||
|
Belanger, Kyle. 2024. <span>“Learning Julia by WebScraping Amtrak
|
||||||
|
Data.”</span> August 9, 2024.
|
||||||
|
</div></div></section></div></main> <!-- /main -->
|
||||||
|
<script id="quarto-html-after-body" type="application/javascript">
|
||||||
|
window.document.addEventListener("DOMContentLoaded", function (event) {
|
||||||
|
const toggleBodyColorMode = (bsSheetEl) => {
|
||||||
|
const mode = bsSheetEl.getAttribute("data-mode");
|
||||||
|
const bodyEl = window.document.querySelector("body");
|
||||||
|
if (mode === "dark") {
|
||||||
|
bodyEl.classList.add("quarto-dark");
|
||||||
|
bodyEl.classList.remove("quarto-light");
|
||||||
|
} else {
|
||||||
|
bodyEl.classList.add("quarto-light");
|
||||||
|
bodyEl.classList.remove("quarto-dark");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const toggleBodyColorPrimary = () => {
|
||||||
|
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
|
||||||
|
if (bsSheetEl) {
|
||||||
|
toggleBodyColorMode(bsSheetEl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
toggleBodyColorPrimary();
|
||||||
|
const icon = "";
|
||||||
|
const anchorJS = new window.AnchorJS();
|
||||||
|
anchorJS.options = {
|
||||||
|
placement: 'right',
|
||||||
|
icon: icon
|
||||||
|
};
|
||||||
|
anchorJS.add('.anchored');
|
||||||
|
const isCodeAnnotation = (el) => {
|
||||||
|
for (const clz of el.classList) {
|
||||||
|
if (clz.startsWith('code-annotation-')) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const onCopySuccess = function(e) {
|
||||||
|
// button target
|
||||||
|
const button = e.trigger;
|
||||||
|
// don't keep focus
|
||||||
|
button.blur();
|
||||||
|
// flash "checked"
|
||||||
|
button.classList.add('code-copy-button-checked');
|
||||||
|
var currentTitle = button.getAttribute("title");
|
||||||
|
button.setAttribute("title", "Copied!");
|
||||||
|
let tooltip;
|
||||||
|
if (window.bootstrap) {
|
||||||
|
button.setAttribute("data-bs-toggle", "tooltip");
|
||||||
|
button.setAttribute("data-bs-placement", "left");
|
||||||
|
button.setAttribute("data-bs-title", "Copied!");
|
||||||
|
tooltip = new bootstrap.Tooltip(button,
|
||||||
|
{ trigger: "manual",
|
||||||
|
customClass: "code-copy-button-tooltip",
|
||||||
|
offset: [0, -8]});
|
||||||
|
tooltip.show();
|
||||||
|
}
|
||||||
|
setTimeout(function() {
|
||||||
|
if (tooltip) {
|
||||||
|
tooltip.hide();
|
||||||
|
button.removeAttribute("data-bs-title");
|
||||||
|
button.removeAttribute("data-bs-toggle");
|
||||||
|
button.removeAttribute("data-bs-placement");
|
||||||
|
}
|
||||||
|
button.setAttribute("title", currentTitle);
|
||||||
|
button.classList.remove('code-copy-button-checked');
|
||||||
|
}, 1000);
|
||||||
|
// clear code selection
|
||||||
|
e.clearSelection();
|
||||||
|
}
|
||||||
|
const getTextToCopy = function(trigger) {
|
||||||
|
const codeEl = trigger.previousElementSibling.cloneNode(true);
|
||||||
|
for (const childEl of codeEl.children) {
|
||||||
|
if (isCodeAnnotation(childEl)) {
|
||||||
|
childEl.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return codeEl.innerText;
|
||||||
|
}
|
||||||
|
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
|
||||||
|
text: getTextToCopy
|
||||||
|
});
|
||||||
|
clipboard.on('success', onCopySuccess);
|
||||||
|
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
|
||||||
|
// For code content inside modals, clipBoardJS needs to be initialized with a container option
|
||||||
|
// TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
|
||||||
|
const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
|
||||||
|
text: getTextToCopy,
|
||||||
|
container: window.document.getElementById('quarto-embedded-source-code-modal')
|
||||||
|
});
|
||||||
|
clipboardModal.on('success', onCopySuccess);
|
||||||
|
}
|
||||||
|
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
|
||||||
|
var mailtoRegex = new RegExp(/^mailto:/);
|
||||||
|
var filterRegex = new RegExp('/' + window.location.host + '/');
|
||||||
|
var isInternal = (href) => {
|
||||||
|
return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
|
||||||
|
}
|
||||||
|
// Inspect non-navigation links and adorn them if external
|
||||||
|
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
|
||||||
|
for (var i=0; i<links.length; i++) {
|
||||||
|
const link = links[i];
|
||||||
|
if (!isInternal(link.href)) {
|
||||||
|
// undo the damage that might have been done by quarto-nav.js in the case of
|
||||||
|
// links that we want to consider external
|
||||||
|
if (link.dataset.originalHref !== undefined) {
|
||||||
|
link.href = link.dataset.originalHref;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
|
||||||
|
const config = {
|
||||||
|
allowHTML: true,
|
||||||
|
maxWidth: 500,
|
||||||
|
delay: 100,
|
||||||
|
arrow: false,
|
||||||
|
appendTo: function(el) {
|
||||||
|
return el.parentElement;
|
||||||
|
},
|
||||||
|
interactive: true,
|
||||||
|
interactiveBorder: 10,
|
||||||
|
theme: 'quarto',
|
||||||
|
placement: 'bottom-start',
|
||||||
|
};
|
||||||
|
if (contentFn) {
|
||||||
|
config.content = contentFn;
|
||||||
|
}
|
||||||
|
if (onTriggerFn) {
|
||||||
|
config.onTrigger = onTriggerFn;
|
||||||
|
}
|
||||||
|
if (onUntriggerFn) {
|
||||||
|
config.onUntrigger = onUntriggerFn;
|
||||||
|
}
|
||||||
|
window.tippy(el, config);
|
||||||
|
}
|
||||||
|
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
|
||||||
|
for (var i=0; i<noterefs.length; i++) {
|
||||||
|
const ref = noterefs[i];
|
||||||
|
tippyHover(ref, function() {
|
||||||
|
// use id or data attribute instead here
|
||||||
|
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
|
||||||
|
try { href = new URL(href).hash; } catch {}
|
||||||
|
const id = href.replace(/^#\/?/, "");
|
||||||
|
const note = window.document.getElementById(id);
|
||||||
|
if (note) {
|
||||||
|
return note.innerHTML;
|
||||||
|
} else {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
const xrefs = window.document.querySelectorAll('a.quarto-xref');
|
||||||
|
/**
 * Build the tooltip HTML for a cross-reference target.
 *
 * `note` is modified in place (column classes and the anchorjs link are
 * removed), so callers hand in a clone or a node from a parsed document.
 *
 * @param {string|null} id - id of the target; null or a "sec-" prefix selects
 *   the section preview (first child plus the first child that is not an
 *   empty paragraph).
 * @param {Element} note - the element being previewed.
 * @returns {string} HTML for the tooltip.
 */
const processXRef = (id, note) => {
  // Run math typesetting on `target` when the page provides it.
  const typeset = (target) => {
    if (window.Quarto?.typesetMath) {
      window.Quarto.typesetMath(target);
    }
  };

  // Strip column container classes from the whole subtree
  const stripColumnClz = (el) => {
    el.classList.remove("page-full", "page-columns");
    for (const child of el.children || []) {
      stripColumnClz(child);
    }
  };
  stripColumnClz(note);

  const isSection = id === null || id.startsWith('sec-');
  if (!isSection) {
    // Remove any anchor links if they are present
    const anchorLink = note.querySelector('a.anchorjs-link');
    if (anchorLink) {
      anchorLink.remove();
    }
    typeset(note);
    // TODO in 1.5, we should make sure this works without a callout special case
    return note.classList.contains("callout") ? note.outerHTML : note.innerHTML;
  }

  // Special case sections, only their first couple elements
  const kids = note.children;
  if (!kids || kids.length <= 2) {
    typeset(note);
    return note.innerHTML;
  }
  const container = document.createElement("div");
  container.appendChild(kids[0].cloneNode(true));
  const firstBody = Array.from(kids)
    .slice(1)
    .find((child) => !(child.tagName === "P" && child.innerText === ""));
  if (firstBody) {
    container.appendChild(firstBody.cloneNode(true));
  }
  typeset(container);
  return container.innerHTML;
};
|
||||||
|
// Cross-reference previews: when an xref link is triggered, resolve its target
// (in this page, or by downloading the page it points to) and show the
// processed HTML in the tooltip.
for (const xref of xrefs) {
  tippyHover(xref, undefined, function(instance) {
    // Keep the tooltip disabled until its content has been resolved.
    instance.disable();
    const reveal = () => {
      instance.enable();
      instance.show();
    };

    // Download `pageUrl`, let `pick` choose the element to preview from the
    // parsed document, and use it as tooltip content. The tooltip is revealed
    // whether or not content could be found.
    const previewRemote = (pageUrl, refId, pick) => {
      fetch(pageUrl)
        .then((res) => res.text())
        .then((text) => {
          const remoteDoc = new DOMParser().parseFromString(text, "text/html");
          const target = pick(remoteDoc);
          if (target !== null) {
            instance.setContent(processXRef(refId, target));
          }
        })
        .catch(() => {
          // The preview is best effort: a failed download or parse leaves the
          // tooltip content untouched instead of raising an unhandled
          // promise rejection.
        })
        .finally(reveal);
    };

    const url = xref.getAttribute('href');
    let hash = undefined;
    if (url.startsWith('#')) {
      hash = url;
    } else {
      // A href that URL() cannot parse is treated as having no hash.
      try { hash = new URL(url).hash; } catch {}
    }

    if (!hash) {
      // See if we can fetch a full url (with no hash to target)
      // This is a special case and we should probably do some content thinning / targeting
      previewRemote(url, null, (remoteDoc) => {
        const main = remoteDoc.querySelector('main.content');
        // This should only happen for chapter cross references
        // (since there is no id in the URL)
        // remove the first header
        if (main !== null && main.children.length > 0 && main.children[0].tagName === "HEADER") {
          main.children[0].remove();
        }
        return main;
      });
      return;
    }

    const id = hash.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    if (note === null) {
      // Not in this document: see if we can fetch the page that holds it
      previewRemote(url.split('#')[0], id, (remoteDoc) => remoteDoc.getElementById(id));
      return;
    }

    // Target is in this document; work on a clone because processXRef
    // modifies the node it is given.
    try {
      instance.setContent(processXRef(id, note.cloneNode(true)));
    } finally {
      reveal();
    }
  });
}
|
||||||
|
let selectedAnnoteEl;
|
||||||
|
/**
 * CSS selector for the span that carries a given annotation of a code cell.
 *
 * @param {string} cell - value matched against `data-code-cell`.
 * @param {string} annotation - value matched against `data-code-annotation`.
 * @returns {string} the selector string.
 */
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
|
||||||
|
/**
 * Highlight the code lines that belong to an annotation.
 *
 * Reads `data-target-cell` / `data-target-annotation` from `annoteEl`, finds
 * the matching annotation span, and positions two absolutely positioned divs
 * (one over the code, one in the cell's annotation gutter) so they span from
 * the first to the last annotated line. On success `annoteEl` is recorded in
 * `selectedAnnoteEl`.
 *
 * If the annotation span, its `data-code-lines` value, or the line elements
 * cannot be found, the function returns without changing anything.
 *
 * @param {Element} annoteEl - the annotation element that was activated.
 */
const selectCodeLines = (annoteEl) => {
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const codeLines = annoteSpan?.getAttribute("data-code-lines");
  if (!codeLines) {
    return;
  }

  // Line element ids have the form "<cell>-<line>".
  const lineIds = codeLines.split(",").map((line) => targetCell + "-" + line);
  const firstLine = window.document.getElementById(lineIds[0]);
  const lastLine = window.document.getElementById(lineIds[lineIds.length - 1]);
  if (firstLine === null || lastLine === null) {
    return;
  }

  // Vertical extent: a single line uses its own height, several lines span
  // from the top of the first to the bottom of the last.
  const top = firstLine.offsetTop;
  const height = lineIds.length > 1
    ? lastLine.offsetTop + lastLine.offsetHeight - top
    : firstLine.offsetHeight;
  const parent = firstLine.parentElement?.parentElement ?? null;

  if (parent !== null) {
    // cook up a div (if necessary) and position it
    let div = window.document.getElementById("code-annotation-line-highlight");
    if (div === null) {
      div = window.document.createElement("div");
      div.setAttribute("id", "code-annotation-line-highlight");
      div.style.position = 'absolute';
      parent.appendChild(div);
    }
    div.style.top = top - 2 + "px";
    div.style.height = height + 4 + "px";
    div.style.left = 0;

    // Same vertical extent, mirrored into the cell's annotation gutter.
    let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
    if (gutterDiv === null) {
      const codeCell = window.document.getElementById(targetCell);
      const gutter = codeCell ? codeCell.querySelector('.code-annotation-gutter') : null;
      if (gutter) {
        gutterDiv = window.document.createElement("div");
        gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
        gutterDiv.style.position = 'absolute';
        gutter.appendChild(gutterDiv);
      }
    }
    if (gutterDiv !== null) {
      gutterDiv.style.top = top - 2 + "px";
      gutterDiv.style.height = height + 4 + "px";
    }
  }

  selectedAnnoteEl = annoteEl;
};
|
||||||
|
/**
 * Remove the line-highlight and gutter-highlight elements (when present) and
 * clear the record of the selected annotation.
 */
const unselectCodeLines = () => {
  for (const elId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    const highlight = window.document.getElementById(elId);
    if (highlight) {
      highlight.remove();
    }
  }
  selectedAnnoteEl = undefined;
};
|
||||||
|
// On window resize, re-run the highlight for the currently selected
// annotation (if any); calls are rate-limited through throttle().
window.addEventListener("resize", throttle(() => {
  // NOTE(review): elRect is not declared in this part of the file; confirm it
  // exists in an enclosing scope, otherwise this assignment creates a global.
  elRect = undefined;
  if (!selectedAnnoteEl) {
    return;
  }
  selectCodeLines(selectedAnnoteEl);
}, 10));
|
||||||
|
/**
 * Wrap `fn` so that bursts of calls are rate-limited.
 *
 * The first call runs `fn` immediately and closes the gate. While the gate is
 * closed, each call replaces any pending timer with a new one; when a timer
 * fires (`ms` after the most recent call) `fn` runs with that call's
 * arguments and the gate reopens.
 *
 * @param {Function} fn - function to invoke.
 * @param {number} ms - delay passed to setTimeout for deferred calls.
 * @returns {Function} the rate-limited wrapper.
 */
function throttle(fn, ms) {
  let gateClosed = false;
  let pendingTimer;
  return (...args) => {
    if (gateClosed) {
      // Only the most recent deferred call survives.
      if (pendingTimer) clearTimeout(pendingTimer);
      pendingTimer = setTimeout(() => {
        fn.apply(this, args);
        gateClosed = false;
        pendingTimer = false;
      }, ms);
      return;
    }
    // first call gets through
    fn.apply(this, args);
    gateClosed = true;
  };
}
|
||||||
|
// Attach click handler to the DT: clicking an annotation term highlights its
// code lines; clicking the already-selected term clears the highlight.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
  annoteDlNode.addEventListener('click', () => {
    // Act on the <dt> the listener is bound to. event.target could be a
    // descendant of the <dt>, which carries no data-target-* attributes.
    const clickedEl = annoteDlNode;
    if (clickedEl === selectedAnnoteEl) {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
      return;
    }
    // Switch the selection: clear the old highlight and active marker first.
    unselectCodeLines();
    const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
    if (activeEl) {
      activeEl.classList.remove('code-annotation-active');
    }
    selectCodeLines(clickedEl);
    clickedEl.classList.add('code-annotation-active');
  });
}
|
||||||
|
/**
 * Walk up from `el` until a parent carrying `data-cites` is found.
 *
 * @param {Element} el - element to start from.
 * @returns {{el: Element, cites: string[]}|undefined} the child of the parent
 *   that holds `data-cites`, with the space-separated cite keys split into an
 *   array; undefined when no ancestor has the attribute.
 */
const findCites = (el) => {
  let node = el;
  while (node.parentElement) {
    const cites = node.parentElement.dataset.cites;
    if (cites) {
      return { el: node, cites: cites.split(' ') };
    }
    node = node.parentElement;
  }
  return undefined;
};
|
||||||
|
// Citation links: hovering one shows the bibliography entries it cites.
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    continue;
  }
  tippyHover(citeInfo.el, () => {
    // Assemble one entry per cite key in a scratch container and return its markup.
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const citeDiv = window.document.createElement('div');
      citeDiv.classList.add('hanging-indent', 'csl-entry');
      // Bibliography entries are looked up by the id "ref-<cite key>".
      const biblioDiv = window.document.getElementById('ref-' + cite);
      if (biblioDiv) {
        citeDiv.innerHTML = biblioDiv.innerHTML;
      }
      popup.appendChild(citeDiv);
    }
    return popup.innerHTML;
  });
}
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</div> <!-- /content -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</body></html>
|
File diff suppressed because it is too large
Load diff
After Width: | Height: | Size: 138 KiB |
|
@ -236,5 +236,61 @@
|
||||||
"title": "Reflex Testing using Machine Learning in the Clinical Laboratory",
|
"title": "Reflex Testing using Machine Learning in the Clinical Laboratory",
|
||||||
"section": "",
|
"section": "",
|
||||||
"text": "Full Paper\nTo view the full paper please go to the following link\n\n\nAbstract\nIntroduction: This research study focuses on developing and testing a machine learning algorithm to predict the FT4 result or diagnose hyper or hypothyroidism in clinical chemistry. The goal is to bridge the gap between hard-coded reflex testing and fully manual reflective testing using machine learning algorithms. The significance of this study lies in the increasing healthcare costs, where laboratory services contribute significantly to medical decisions and budgets. By implementing automated reflex testing with machine learning algorithms, unnecessary laboratory tests can be reduced, resulting in cost savings and improved efficiency in the healthcare system.\nMethods: The study was performed using the Medical Information Mart for Intensive Care (MIMIC) database for data collection. The database consists of de-identified health-related data from critical care units. Eighteen variables, including patient demographics and lab values, were selected for the study. The data set was filtered based on specific criteria, and an outcome variable was created to determine if the Free T4 value was diagnostic. The data handling and modeling were performed using R and R Studio. Regression and classification models were screened using a random grid search to tune hyperparameters, and random forest models were selected as the final models based on their performance. The selected hyperparameters for both regression and classification models are specified.\nResults: The study analyzed a dataset of 11,340 observations, randomly splitting it into a training set (9071 observations) and a testing set (2269 observations) based on the Free T4 laboratory diagnostic value stratification. Classification algorithms were used to predict whether Free T4 would be diagnostic, achieving an accuracy of 0.796 and an AUC of 0.918. The model had a sensitivity of 0.632 and a specificity of 0.892. 
The importance of individual analytes was assessed, with TSH being the most influential variable. The study also evaluated the predictability of Free T4 results using regression, achieving a Root Mean Square Error (RMSE) of 0.334. The predicted results had an accuracy of 0.790, similar to the classification model.\nDiscussion: The study found that the diagnostic value of Free T4 can be accurately predicted 80% of the time using machine learning algorithms. However, the model had limitations in terms of sensitivity, with a false negative rate of 16% for elevated TSH results and 20% for decreased TSH results. The model achieved a specificity of 89% but did not meet the threshold for clinical deployment. The importance of individual analytes was explored, revealing unexpected correlations between TSH and hematology results, which could be valuable for future algorithms. Real-world applications could use predictive models in clinical decision-making systems to determine the need for Free T4 lab tests based on predictions and patient signs and symptoms. However, implementing such algorithms in existing laboratory information systems poses challenges.\n\n\n\n\nReuseCC BY 4.0CitationBibTeX citation:@online{belanger2023,\n author = {Belanger, Kyle},\n title = {Reflex {Testing} Using {Machine} {Learning} in the {Clinical}\n {Laboratory}},\n date = {2023-10-12},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2023. “Reflex Testing Using Machine Learning in\nthe Clinical Laboratory.” October 12, 2023."
|
"text": "Full Paper\nTo view the full paper please go to the following link\n\n\nAbstract\nIntroduction: This research study focuses on developing and testing a machine learning algorithm to predict the FT4 result or diagnose hyper or hypothyroidism in clinical chemistry. The goal is to bridge the gap between hard-coded reflex testing and fully manual reflective testing using machine learning algorithms. The significance of this study lies in the increasing healthcare costs, where laboratory services contribute significantly to medical decisions and budgets. By implementing automated reflex testing with machine learning algorithms, unnecessary laboratory tests can be reduced, resulting in cost savings and improved efficiency in the healthcare system.\nMethods: The study was performed using the Medical Information Mart for Intensive Care (MIMIC) database for data collection. The database consists of de-identified health-related data from critical care units. Eighteen variables, including patient demographics and lab values, were selected for the study. The data set was filtered based on specific criteria, and an outcome variable was created to determine if the Free T4 value was diagnostic. The data handling and modeling were performed using R and R Studio. Regression and classification models were screened using a random grid search to tune hyperparameters, and random forest models were selected as the final models based on their performance. The selected hyperparameters for both regression and classification models are specified.\nResults: The study analyzed a dataset of 11,340 observations, randomly splitting it into a training set (9071 observations) and a testing set (2269 observations) based on the Free T4 laboratory diagnostic value stratification. Classification algorithms were used to predict whether Free T4 would be diagnostic, achieving an accuracy of 0.796 and an AUC of 0.918. The model had a sensitivity of 0.632 and a specificity of 0.892. 
The importance of individual analytes was assessed, with TSH being the most influential variable. The study also evaluated the predictability of Free T4 results using regression, achieving a Root Mean Square Error (RMSE) of 0.334. The predicted results had an accuracy of 0.790, similar to the classification model.\nDiscussion: The study found that the diagnostic value of Free T4 can be accurately predicted 80% of the time using machine learning algorithms. However, the model had limitations in terms of sensitivity, with a false negative rate of 16% for elevated TSH results and 20% for decreased TSH results. The model achieved a specificity of 89% but did not meet the threshold for clinical deployment. The importance of individual analytes was explored, revealing unexpected correlations between TSH and hematology results, which could be valuable for future algorithms. Real-world applications could use predictive models in clinical decision-making systems to determine the need for Free T4 lab tests based on predictions and patient signs and symptoms. However, implementing such algorithms in existing laboratory information systems poses challenges.\n\n\n\n\nReuseCC BY 4.0CitationBibTeX citation:@online{belanger2023,\n author = {Belanger, Kyle},\n title = {Reflex {Testing} Using {Machine} {Learning} in the {Clinical}\n {Laboratory}},\n date = {2023-10-12},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2023. “Reflex Testing Using Machine Learning in\nthe Clinical Laboratory.” October 12, 2023."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "",
|
||||||
|
"text": "Recently two things happened quite close together that started me on the journey to this post.\nSo these two things led me to this: pulling Amtrak data from the web using Julia. I do not claim to be an expert on Julia, but I am learning and I wanted to share my journey; nor do I claim to be an expert at web scraping. Taking those things into account, let's follow along."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#load-packages",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#load-packages",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Load Packages",
|
||||||
|
"text": "Load Packages\nFirst off I will load the Julia packages I am going to use. The first three all have to do with web scraping, and getting the data off the website. CairoMakie will be used to make the plot. All of the rest are for data wrangling. I already have all of these packages in this project environment so I just need to let the Julia REPL know to load them. If you are brand new to Julia this site really helped explain the idea of project environments to me. I also use VSCode along with the Julia extension which does a great job of handling the project environment.\n\nusing HTTP\nusing Gumbo\nusing Cascadia\nusing DataFrames\nusing DataFramesMeta\nusing Dates\nusing Statistics\nusing CategoricalArrays\nusing CairoMakie"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#setting-up-the-web-scraping",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#setting-up-the-web-scraping",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Setting up the Web Scraping",
|
||||||
|
"text": "Setting up the Web Scraping\nNow that the packages are loaded, we can start setting up the web scraping. From my internet searching I found that Amtrak does have an API but it is quite challenging to use. I found this website Amtrak Status which does a great job of showing the data I was looking for. In this example I am just going to pull data for two trains, train 97 and train 98. You can see in the link I set those as the train numbers, and if you follow the link you will see it sets it up in a nice table to view the historical data. We then use the HTTP package to get the raw website data and then use Gumbo to parse the HTML into a table. The Cascadia package gives the various CSS selectors to help pull the info I want out of the entire page. The page table does not have any ids but it is also the only table on the page. I was able to use the CSS Selector “tr” to get each row of the table into a vector. If we examine the third item in the rows vector we see that it has the information we want (the first two rows are headers for the table)\n\n\nurl = \"https://juckins.net/amtrak_status/archive/html/history.php?train_num=97%2C98&station=&date_start=07%2F01%2F2024&date_end=07%2F31%2F2024\";\nresp = HTTP.get(url);\npage = parsehtml(String(resp.body));\n\nrows = eachmatch(sel\"tr\",page.root);\n\nrows[3]"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#creating-the-dataframe",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#creating-the-dataframe",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Creating the DataFrame",
|
||||||
|
"text": "Creating the DataFrame\nNow that each row of the table is stored in a vector we need to rebuild the table into a dataframe in Julia. First I am intializing an empty dataframe by creating each column that will hold data. The column names match those of the header in the table on the website. Then I loop through each item in the rows vector. The text variable is a vector of all the td elements in the row. If the text vector is not empty and has more than one item in it, then we loop through the items and push the text into the row_data vector. Finally we push the row_data vector into the dataframe created prior to the loop. By having the nested if I can remove the footer column at the end of the table from the website. The website table header uses a different CSS selector than the rest of the table but the footer does not. At the end of the loop I now have the same table that is on the website but stored as a dataframe in Julia.\n\n# create empty DataFrame and then populate it with the table from website\ndf = DataFrame(train = String[], origin_date = [], station = String[], sch_dp = [], act_dp = String[], comments = [], s_disrupt = [], cancellations = [])\n\nfor i in rows\n text = eachmatch(Selector(\"td\"), i)\n row_data = []\n if !isempty(text) && length(text) > 1\n for item in text\n push!(row_data, nodeText(item))\n end\n push!(df, row_data)\n end\nend"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#cleaning-the-dataframe",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#cleaning-the-dataframe",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Cleaning the DataFrame",
|
||||||
|
"text": "Cleaning the DataFrame\nComing from R I am quite familiar with data cleaning using dpylr and the rest of the tidyverse packages. When looking at options I really liked what the DataFramesMeta package brings, so I have used that here to get the data were I want it. I first filter out any trains that have a service disruption as well as any that are blank in the departure column. Next I select only the station, train, and the comments column. I originally tried using the two departure columns but was having an issue with trains that arrived at the stations on day but then left the next. These were causing the delay to be quite large as it was calculating as if it actually left before arriving. The comments column has what I needed I just had to pull the string out and convert it to a numeric. After selecting the columns I first create the delay column. This pulled the comment string out of the comment column only if it contains Dp: as this indicates how late or early the train left. Next I am pulling out the time in minutes and hours from the delay string and converting those numbers to integers. The total delay column adds the minutes and hours together and if the word late is not in the column it will convert the number to negative. A negative delay in this case means the train left early. Finally I transform the columns to categorical so that they are easier to work with in the future. You can notice that for the last transformation I could not figure out how to select two columns using the transform macro. Also for those coming from R note the .=> this is the broadcast operator and it lets Julia know to perform the action on the entire vector (I think I am explaining this right!) 
I end the block by showing the first 5 rows of the modified dataframe.\n\n\nmod_df = @chain df begin\n @rsubset :act_dp != \"\" && :s_disrupt != \"SD\"\n @select :train :station :comments\n #can't perform match if there is nothing there\n @rtransform :delay = occursin(r\"Dp:\", :comments) ? match(r\"Dp:.*\", :comments).match : \"\"\n @rtransform :min = occursin(r\"min\", :delay) ? parse(Int,match(r\"([0-9]*) min\", :delay)[1]) : Int(0)\n @rtransform :hour = occursin(r\"hr\", :delay) ? parse(Int,match(r\"([0-9]*) hr\", :delay)[1]) *60 : Int(0)\n @rtransform :total_delay_mins = :min + :hour |> x -> occursin(r\"late\", :delay) ? x : x *-1 #if word late does not appear, train left early\n transform([:station, :train] .=> categorical, renamecols = false)\nend\n\nfirst(mod_df, 5)\n\n5×7 DataFrame\n\n\n\nRow\ntrain\nstation\ncomments\ndelay\nmin\nhour\ntotal_delay_mins\n\n\n\nCat…\nCat…\nAny\nAbstract…\nInt64\nInt64\nInt64\n\n\n\n\n1\n97\nRMT\nDp: 1 min late.\nDp: 1 min late.\n1\n0\n1\n\n\n2\n98\nFLO\nAr: 7 min early. | Dp: On time.\nDp: On time.\n0\n0\n0\n\n\n3\n98\nKTR\nDp: 12 min late.\nDp: 12 min late.\n12\n0\n12\n\n\n4\n97\nPTB\nDp: 6 min late.\nDp: 6 min late.\n6\n0\n6\n\n\n5\n97\nRVR\nAr: 8 min late. | Dp: 5 min late.\nDp: 5 min late.\n5\n0\n5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#grouping-and-summarizing",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#grouping-and-summarizing",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Grouping and Summarizing",
|
||||||
|
"text": "Grouping and Summarizing\nNow that I have the data I want, I want to group and summarize to create some graphs. Again using DataFramesMeta and the by keyword I can group by the train and station columns and then create the mean, median, max, and min columns. This action felt very to summarize in dplyr. DataFramesMeta does allow you to do the grouping and combining as two separate steps, but the by keyword combines in into one step. I then ordered by the station column and then by the train column. I then created a column that shows the difference in the mean delay between the two trains. I didn’t end up using this for now but I might make something with it later. Last I created two columns that contain the level code for the station and train columns. I will talk about the reason for this in the next section. The function levelcode is from the CategoricalArrays package and it creates an integer column that matches the level of the categorical name. Last I display the first 5 rows of the dataframe.\n\ngd = @chain mod_df begin\n @by _ [:train,:station] begin\n :mean = Float32[Statistics.mean(:total_delay_mins)]\n :median = Statistics.median(:total_delay_mins)\n :max = maximum(:total_delay_mins)\n :min = minimum(:total_delay_mins) \n end \n @orderby :station :train\n @groupby :station\n @transform :diff = [missing; diff(:mean)]\n @rtransform _ begin\n :station_code = levelcode(:station)\n :train_code = levelcode(:train)\n end\nend\n\nfirst(gd, 5)\n\n5×9 DataFrame\n\n\n\nRow\ntrain\nstation\nmean\nmedian\nmax\nmin\ndiff\nstation_code\ntrain_code\n\n\n\nCat…\nCat…\nFloat32\nFloat64\nInt64\nInt64\nFloat32?\nInt64\nInt64\n\n\n\n\n1\n97\nALX\n70.4\n50.0\n287\n0\nmissing\n1\n1\n\n\n2\n98\nALX\n101.387\n77.0\n399\n-16\n30.9871\n1\n2\n\n\n3\n97\nBAL\n53.3333\n27.0\n267\n3\nmissing\n2\n1\n\n\n4\n98\nBAL\n120.226\n104.0\n414\n0\n66.8925\n2\n2\n\n\n5\n97\nCHS\n71.1\n53.0\n286\n0\nmissing\n3\n1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#plotting",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#plotting",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Plotting",
|
||||||
|
"text": "Plotting\nComing from R and the ggplot package (also having played around a bit in Plotly for R) there was a rather step learning curve to Makie! I do feel there is a ton of flexibility in Makie, but learning to use it is a beast, and was probably the hardest part of this whole thing. The first challenge was Makie does not like categorical variables (at least for barplots, don’t know if this is always true), thus the need for using the level codes so I could pass a numerical vector to the x axis. I am then able to label that axis with the categorical labels. Makie does also allow you to just call the barplot function without all the other set up, and it will automatically create the figure and axis, however I wanted to do it manually and really build up the graph. First step was setting a color gradient, I used Dark2 from the ColorBrewer schemes, just as a personal preference for one I really like. Next up I create the figure. Directly from the Makie docs, The Figure is the outermost container object. I could pass some arguments to the Figure constructor, and change size or colors, but for this one I just left everything as the defaults. Next up is creating the axis. I placed it at position 1,1 within the previously created figure. I also pass labels for the x and y axis, a title, and then the labels for the xticks. The label roation is in radian so pi/2 rotates the labels 90 degrees. Next I generate the barplot. Not the ! in the function call allows for plotting on an existing axis. 
(More info on the Bang Operator) Last I set up Labels and Colors for the Legend, and the place the Legend at position 1,2 of the existing figure.\n\ncolors = cgrad(:Dark2_6)\nf = Figure();\nax = Axis(f[1,1], xlabel = \"Station\", ylabel = \"Mean Delay (mins)\", title = \"Mean Delay by Station\", xticks = (1:length(levels(gd.station_code)), levels(gd.station)), xticklabelrotation = pi/2)\nbarplot!(ax, gd.station_code, gd.mean, dodge = gd.train_code, color = colors[gd.train_code]) \n\nlabels = [\"$i\" for i in unique(gd.train)]\nelements = [PolyElement(polycolor = colors[i]) for i in unique(gd.train_code)]\n\nLegend(f[1,2],elements, labels, \"Train Number\")\n\n\nf"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"objectID": "posts/2024-08-09-learning-Julia/index.html#conclusion",
|
||||||
|
"href": "posts/2024-08-09-learning-Julia/index.html#conclusion",
|
||||||
|
"title": "Learning Julia by WebScraping Amtrak Data",
|
||||||
|
"section": "Conclusion",
|
||||||
|
"text": "Conclusion\nThere is still a lot that could be done with this data set, and I am interested to keep playing around with it to see what kind of insights I could gather. Overall I learned a lot about Julia, but as I learned with R, there is always more to learn! I look forward to seeing where this journey takes me."
|
||||||
}
|
}
|
||||||
]
|
]
|
4
_site/site_libs/bootstrap/bootstrap.min.css
vendored
4
_site/site_libs/bootstrap/bootstrap.min.css
vendored
File diff suppressed because one or more lines are too long
|
@ -85,6 +85,7 @@ code span.st {
|
||||||
|
|
||||||
code span.cf {
|
code span.cf {
|
||||||
color: #003B4F;
|
color: #003B4F;
|
||||||
|
font-weight: bold;
|
||||||
font-style: inherit;
|
font-style: inherit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,6 +194,7 @@ code span.dv {
|
||||||
|
|
||||||
code span.kw {
|
code span.kw {
|
||||||
color: #003B4F;
|
color: #003B4F;
|
||||||
|
font-weight: bold;
|
||||||
font-style: inherit;
|
font-style: inherit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -94,7 +94,7 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
if (link.href.indexOf("#") !== -1) {
|
if (link.href.indexOf("#") !== -1) {
|
||||||
const anchor = link.href.split("#")[1];
|
const anchor = link.href.split("#")[1];
|
||||||
const heading = window.document.querySelector(
|
const heading = window.document.querySelector(
|
||||||
`[data-anchor-id=${anchor}]`
|
`[data-anchor-id="${anchor}"]`
|
||||||
);
|
);
|
||||||
if (heading) {
|
if (heading) {
|
||||||
// Add the class
|
// Add the class
|
||||||
|
@ -134,8 +134,10 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
window.innerHeight + window.pageYOffset >=
|
window.innerHeight + window.pageYOffset >=
|
||||||
window.document.body.offsetHeight
|
window.document.body.offsetHeight
|
||||||
) {
|
) {
|
||||||
|
// This is the no-scroll case where last section should be the active one
|
||||||
sectionIndex = 0;
|
sectionIndex = 0;
|
||||||
} else {
|
} else {
|
||||||
|
// This finds the last section visible on screen that should be made active
|
||||||
sectionIndex = [...sections].reverse().findIndex((section) => {
|
sectionIndex = [...sections].reverse().findIndex((section) => {
|
||||||
if (section) {
|
if (section) {
|
||||||
return window.pageYOffset >= section.offsetTop - sectionMargin;
|
return window.pageYOffset >= section.offsetTop - sectionMargin;
|
||||||
|
@ -317,6 +319,7 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
for (const child of el.children) {
|
for (const child of el.children) {
|
||||||
child.style.opacity = 0;
|
child.style.opacity = 0;
|
||||||
child.style.overflow = "hidden";
|
child.style.overflow = "hidden";
|
||||||
|
child.style.pointerEvents = "none";
|
||||||
}
|
}
|
||||||
|
|
||||||
nexttick(() => {
|
nexttick(() => {
|
||||||
|
@ -358,6 +361,7 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
|
|
||||||
const clone = child.cloneNode(true);
|
const clone = child.cloneNode(true);
|
||||||
clone.style.opacity = 1;
|
clone.style.opacity = 1;
|
||||||
|
clone.style.pointerEvents = null;
|
||||||
clone.style.display = null;
|
clone.style.display = null;
|
||||||
toggleContents.append(clone);
|
toggleContents.append(clone);
|
||||||
}
|
}
|
||||||
|
@ -432,6 +436,7 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
for (const child of el.children) {
|
for (const child of el.children) {
|
||||||
child.style.opacity = 1;
|
child.style.opacity = 1;
|
||||||
child.style.overflow = null;
|
child.style.overflow = null;
|
||||||
|
child.style.pointerEvents = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const placeholderEl = window.document.getElementById(
|
const placeholderEl = window.document.getElementById(
|
||||||
|
@ -739,6 +744,7 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
// Process the collapse state if this is an UL
|
// Process the collapse state if this is an UL
|
||||||
if (el.tagName === "UL") {
|
if (el.tagName === "UL") {
|
||||||
if (tocOpenDepth === -1 && depth > 1) {
|
if (tocOpenDepth === -1 && depth > 1) {
|
||||||
|
// toc-expand: false
|
||||||
el.classList.add("collapse");
|
el.classList.add("collapse");
|
||||||
} else if (
|
} else if (
|
||||||
depth <= tocOpenDepth ||
|
depth <= tocOpenDepth ||
|
||||||
|
@ -757,10 +763,9 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
};
|
};
|
||||||
|
|
||||||
// walk the TOC and expand / collapse any items that should be shown
|
// walk the TOC and expand / collapse any items that should be shown
|
||||||
|
|
||||||
if (tocEl) {
|
if (tocEl) {
|
||||||
walk(tocEl, 0);
|
|
||||||
updateActiveLink();
|
updateActiveLink();
|
||||||
|
walk(tocEl, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Throttle the scroll event and walk peridiocally
|
// Throttle the scroll event and walk peridiocally
|
||||||
|
@ -779,6 +784,10 @@ window.document.addEventListener("DOMContentLoaded", function (_event) {
|
||||||
window.addEventListener(
|
window.addEventListener(
|
||||||
"resize",
|
"resize",
|
||||||
throttle(() => {
|
throttle(() => {
|
||||||
|
if (tocEl) {
|
||||||
|
updateActiveLink();
|
||||||
|
walk(tocEl, 0);
|
||||||
|
}
|
||||||
if (!isReaderMode()) {
|
if (!isReaderMode()) {
|
||||||
hideOverlappedSidebars();
|
hideOverlappedSidebars();
|
||||||
}
|
}
|
||||||
|
|
2
_site/site_libs/quarto-listing/list.min.js
vendored
2
_site/site_libs/quarto-listing/list.min.js
vendored
File diff suppressed because one or more lines are too long
|
@ -5,9 +5,45 @@ const headroomChanged = new CustomEvent("quarto-hrChanged", {
|
||||||
composed: false,
|
composed: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const announceDismiss = () => {
|
||||||
|
const annEl = window.document.getElementById("quarto-announcement");
|
||||||
|
if (annEl) {
|
||||||
|
annEl.remove();
|
||||||
|
|
||||||
|
const annId = annEl.getAttribute("data-announcement-id");
|
||||||
|
window.localStorage.setItem(`quarto-announce-${annId}`, "true");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const announceRegister = () => {
|
||||||
|
const annEl = window.document.getElementById("quarto-announcement");
|
||||||
|
if (annEl) {
|
||||||
|
const annId = annEl.getAttribute("data-announcement-id");
|
||||||
|
const isDismissed =
|
||||||
|
window.localStorage.getItem(`quarto-announce-${annId}`) || false;
|
||||||
|
if (isDismissed) {
|
||||||
|
announceDismiss();
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
annEl.classList.remove("hidden");
|
||||||
|
}
|
||||||
|
|
||||||
|
const actionEl = annEl.querySelector(".quarto-announcement-action");
|
||||||
|
if (actionEl) {
|
||||||
|
actionEl.addEventListener("click", function (e) {
|
||||||
|
e.preventDefault();
|
||||||
|
// Hide the bar immediately
|
||||||
|
announceDismiss();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
window.document.addEventListener("DOMContentLoaded", function () {
|
window.document.addEventListener("DOMContentLoaded", function () {
|
||||||
let init = false;
|
let init = false;
|
||||||
|
|
||||||
|
announceRegister();
|
||||||
|
|
||||||
// Manage the back to top button, if one is present.
|
// Manage the back to top button, if one is present.
|
||||||
let lastScrollTop = window.pageYOffset || document.documentElement.scrollTop;
|
let lastScrollTop = window.pageYOffset || document.documentElement.scrollTop;
|
||||||
const scrollDownBuffer = 5;
|
const scrollDownBuffer = 5;
|
||||||
|
|
|
@ -1275,7 +1275,11 @@ async function fuseSearch(query, fuse, fuseOptions) {
|
||||||
|
|
||||||
// If we don't have a subfuse and the query is long enough, go ahead
|
// If we don't have a subfuse and the query is long enough, go ahead
|
||||||
// and create a subfuse to use for subsequent queries
|
// and create a subfuse to use for subsequent queries
|
||||||
if (now - then > kFuseMaxWait && subSearchFuse === undefined) {
|
if (
|
||||||
|
now - then > kFuseMaxWait &&
|
||||||
|
subSearchFuse === undefined &&
|
||||||
|
resultsRaw.length < fuseOptions.limit
|
||||||
|
) {
|
||||||
subSearchTerm = query;
|
subSearchTerm = query;
|
||||||
subSearchFuse = new window.Fuse([], kFuseIndexOptions);
|
subSearchFuse = new window.Fuse([], kFuseIndexOptions);
|
||||||
resultsRaw.forEach((rr) => {
|
resultsRaw.forEach((rr) => {
|
||||||
|
|
142
posts/2024-08-09-learning-Julia/index.qmd
Normal file
142
posts/2024-08-09-learning-Julia/index.qmd
Normal file
|
@ -0,0 +1,142 @@
|
||||||
|
---
|
||||||
|
title: "Learning Julia by WebScraping Amtrak Data"
|
||||||
|
# description: ""
|
||||||
|
date: "08/09/2024" #Update when live
|
||||||
|
draft: true
|
||||||
|
categories:
|
||||||
|
- Julia
|
||||||
|
- dataViz
|
||||||
|
engine: julia
|
||||||
|
---
|
||||||
|
|
||||||
|
Recently two things happened quite close together that started me on the journey to this post.
|
||||||
|
|
||||||
|
1. First I have been planning on fiddling around and learning Julia for a while. I love R and that love will not change but I thought it was good to try something different.
|
||||||
|
2. My mom took a train and it was super late! I started looking at the station and it seemed like it was always late.
|
||||||
|
|
||||||
|
So these two things led me to this: pulling Amtrak data from the web using Julia. I do not claim to be an expert on Julia, but I am learning and I wanted to share my journey, nor do I claim to be an expert at Web Scraping. Taking those things into account, let's follow along.
|
||||||
|
|
||||||
|
## Load Packages
|
||||||
|
|
||||||
|
First off I will load the Julia packages I am going to use. The first three all have to do with web scraping, and getting the data off the website. CairoMakie will be used to make the plot. All of the rest are for data wrangling. I already have all of these packages in this project environment so I just need to let the Julia REPL know to load them. If you are brand new to Julia this [site](https://towardsdatascience.com/how-to-setup-project-environments-in-julia-ec8ae73afe9c) really helped explain the idea of project environments to me. I also use [VSCode](https://code.visualstudio.com/) along with the [Julia extension](https://marketplace.visualstudio.com/items?itemName=julialang.language-julia) which does a great job of handling the project environment.
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
using HTTP
|
||||||
|
using Gumbo
|
||||||
|
using Cascadia
|
||||||
|
using DataFrames
|
||||||
|
using DataFramesMeta
|
||||||
|
using Dates
|
||||||
|
using Statistics
|
||||||
|
using CategoricalArrays
|
||||||
|
using CairoMakie
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Setting up the Web Scraping
|
||||||
|
|
||||||
|
Now that the packages are loaded, we can start setting up the web scraping. From my internet searching I found that Amtrak does have an API but it is quite challenging to use. I found this website [Amtrak Status](https://juckins.net/amtrak_status/archive/html/home.php) which does a great job of showing the data I was looking for. In this example I am just going to pull data for two trains, train 97 and train 98. You can see in the link I set those as the train numbers, and if you follow the link you will see it sets it up in a nice table to view the historical data. We then use the HTTP package to get the raw website data and then use Gumbo to parse the HTML into a table. The Cascadia package gives the various CSS selectors to help pull the info I want out of the entire page. The page table does not have an id, but it is also the only table on the page. I was able to use the CSS Selector "tr" to get each row of the table into a vector. If we examine the third item in the rows vector we see that it has the information we want (the first two rows are headers for the table).
|
||||||
|
|
||||||
|
<!-- cspell: disable -->
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
#| output: false
|
||||||
|
|
||||||
|
url = "https://juckins.net/amtrak_status/archive/html/history.php?train_num=97%2C98&station=&date_start=07%2F01%2F2024&date_end=07%2F31%2F2024";
|
||||||
|
resp = HTTP.get(url);
|
||||||
|
page = parsehtml(String(resp.body));
|
||||||
|
|
||||||
|
rows = eachmatch(sel"tr",page.root);
|
||||||
|
|
||||||
|
rows[3]
|
||||||
|
```
|
||||||
|
|
||||||
|
<!-- cspell: enable -->
|
||||||
|
|
||||||
|
## Creating the DataFrame
|
||||||
|
|
||||||
|
Now that each row of the table is stored in a vector we need to rebuild the table into a dataframe in Julia. First I am initializing an empty dataframe by creating each column that will hold data. The column names match those of the header in the table on the website. Then I loop through each item in the rows vector. The text variable is a vector of all the td elements in the row. If the text vector is not empty and has more than one item in it, then we loop through the items and push the text into the row_data vector. Finally we push the row_data vector into the dataframe created prior to the loop. By having the nested if I can remove the footer column at the end of the table from the website. The website table header uses a different CSS selector than the rest of the table but the footer does not. At the end of the loop I now have the same table that is on the website but stored as a dataframe in Julia.
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
# create empty DataFrame and then populate it with the table from website
|
||||||
|
df = DataFrame(train = String[], origin_date = [], station = String[], sch_dp = [], act_dp = String[], comments = [], s_disrupt = [], cancellations = [])
|
||||||
|
|
||||||
|
for i in rows
|
||||||
|
text = eachmatch(Selector("td"), i)
|
||||||
|
row_data = []
|
||||||
|
if !isempty(text) && length(text) > 1
|
||||||
|
for item in text
|
||||||
|
push!(row_data, nodeText(item))
|
||||||
|
end
|
||||||
|
push!(df, row_data)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cleaning the DataFrame
|
||||||
|
|
||||||
|
Coming from R I am quite familiar with data cleaning using dplyr and the rest of the tidyverse packages. When looking at options I really liked what the DataFramesMeta package brings, so I have used that here to get the data where I want it. I first filter out any trains that have a service disruption as well as any that are blank in the departure column. Next I select only the station, train, and the comments column. I originally tried using the two departure columns but was having an issue with trains that arrived at the station one day but then left the next. These were causing the delay to be quite large as it was calculating as if it actually left before arriving. The comments column has what I needed; I just had to pull the string out and convert it to a numeric. After selecting the columns I first create the delay column. This pulled the comment string out of the comment column only if it contains Dp: as this indicates how late or early the train left. Next I am pulling out the time in minutes and hours from the delay string and converting those numbers to integers. The total delay column adds the minutes and hours together and if the word late is not in the column it will convert the number to negative. A negative delay in this case means the train left early. Finally I transform the columns to categorical so that they are easier to work with in the future. You can notice that for the last transformation I could not figure out how to select two columns using the transform macro. Also, for those coming from R, note the .=> operator: this is the broadcast operator and it lets Julia know to perform the action on the entire vector (I think I am explaining this right!). I end the block by showing the first 5 rows of the modified dataframe.
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
|
||||||
|
mod_df = @chain df begin
|
||||||
|
@rsubset :act_dp != "" && :s_disrupt != "SD"
|
||||||
|
@select :train :station :comments
|
||||||
|
#can't perform match if there is nothing there
|
||||||
|
@rtransform :delay = occursin(r"Dp:", :comments) ? match(r"Dp:.*", :comments).match : ""
|
||||||
|
@rtransform :min = occursin(r"min", :delay) ? parse(Int,match(r"([0-9]*) min", :delay)[1]) : Int(0)
|
||||||
|
@rtransform :hour = occursin(r"hr", :delay) ? parse(Int,match(r"([0-9]*) hr", :delay)[1]) *60 : Int(0)
|
||||||
|
@rtransform :total_delay_mins = :min + :hour |> x -> occursin(r"late", :delay) ? x : x *-1 #if word late does not appear, train left early
|
||||||
|
transform([:station, :train] .=> categorical, renamecols = false)
|
||||||
|
end
|
||||||
|
|
||||||
|
first(mod_df, 5)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Grouping and Summarizing
|
||||||
|
|
||||||
|
Now that I have the data I want, I want to group and summarize to create some graphs. Again using DataFramesMeta and the by keyword I can group by the train and station columns and then create the mean, median, max, and min columns. This action felt very similar to summarize in dplyr. DataFramesMeta does allow you to do the grouping and combining as two separate steps, but the by keyword combines it into one step. I then ordered by the station column and then by the train column. I then created a column that shows the difference in the mean delay between the two trains. I didn't end up using this for now but I might make something with it later. Last I created two columns that contain the level code for the station and train columns. I will talk about the reason for this in the next section. The function levelcode is from the CategoricalArrays package and it creates an integer column that matches the level of the categorical name. Last I display the first 5 rows of the dataframe.
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
gd = @chain mod_df begin
|
||||||
|
@by _ [:train,:station] begin
|
||||||
|
:mean = Float32[Statistics.mean(:total_delay_mins)]
|
||||||
|
:median = Statistics.median(:total_delay_mins)
|
||||||
|
:max = maximum(:total_delay_mins)
|
||||||
|
:min = minimum(:total_delay_mins)
|
||||||
|
end
|
||||||
|
@orderby :station :train
|
||||||
|
@groupby :station
|
||||||
|
@transform :diff = [missing; diff(:mean)]
|
||||||
|
@rtransform _ begin
|
||||||
|
:station_code = levelcode(:station)
|
||||||
|
:train_code = levelcode(:train)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
first(gd, 5)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Plotting
|
||||||
|
Coming from R and the ggplot package (also having played around a bit in Plotly for R) there was a rather steep learning curve to Makie! I do feel there is a ton of flexibility in Makie, but learning to use it is a beast, and was probably the hardest part of this whole thing. The first challenge was Makie does not like categorical variables (at least for barplots, don't know if this is always true), thus the need for using the level codes so I could pass a numerical vector to the x axis. I am then able to label that axis with the categorical labels. Makie does also allow you to just call the barplot function without all the other set up, and it will automatically create the figure and axis, however I wanted to do it manually and really build up the graph.
|
||||||
|
First step was setting a color gradient, I used Dark2 from the ColorBrewer schemes, just as a personal preference for one I really like. Next up I create the figure. Directly from the Makie docs, The Figure is the outermost container object. I could pass some arguments to the Figure constructor, and change size or colors, but for this one I just left everything as the defaults. Next up is creating the axis. I placed it at position 1,1 within the previously created figure. I also pass labels for the x and y axis, a title, and then the labels for the xticks. The label rotation is in radians so pi/2 rotates the labels 90 degrees. Next I generate the barplot. Note that the ! in the function call allows for plotting on an existing axis. ([More info on the Bang Operator](https://docs.julialang.org/en/v1/manual/style-guide/#bang-convention)) Last I set up Labels and Colors for the Legend, and then place the Legend at position 1,2 of the existing figure.
|
||||||
|
|
||||||
|
|
||||||
|
```{julia}
|
||||||
|
colors = cgrad(:Dark2_6)
|
||||||
|
f = Figure();
|
||||||
|
ax = Axis(f[1,1], xlabel = "Station", ylabel = "Mean Delay (mins)", title = "Mean Delay by Station", xticks = (1:length(levels(gd.station_code)), levels(gd.station)), xticklabelrotation = pi/2)
|
||||||
|
barplot!(ax, gd.station_code, gd.mean, dodge = gd.train_code, color = colors[gd.train_code])
|
||||||
|
|
||||||
|
labels = ["$i" for i in unique(gd.train)]
|
||||||
|
elements = [PolyElement(polycolor = colors[i]) for i in unique(gd.train_code)]
|
||||||
|
|
||||||
|
Legend(f[1,2],elements, labels, "Train Number")
|
||||||
|
|
||||||
|
|
||||||
|
f
|
||||||
|
```
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
There is still a lot that could be done with this data set, and I am interested to keep playing around with it to see what kind of insights I could gather. Overall I learned a lot about Julia but as I learned with R there is always more to learn! I look forward to seeing where this journey takes me.
|
Loading…
Reference in a new issue