import * as React from "react"
import { Link } from "gatsby"

import Layout from "../../components/layout"
import Seo from "../../components/seo"

import { projects } from "../../components/data/projects"
import Index from "../../components/index"

import { StaticImage } from "gatsby-plugin-image"

import ProjectHeader from "../../components/project-header/project-header"
import ReadNext from "../../components/read-next/read-next"
import { RoughNotation } from "react-rough-notation"

// Entries for the in-page chapter index rendered by <Index />.
// Each pair is [label shown to the reader, id of the <section> to scroll to];
// the ids must stay in sync with the `id` props on the sections of this page.
const chapters = [
  [`Overview`, 'overview'],
  ['Brainstorming', 'brainstorming'],
  ['Development', 'development'],
  ['Demo', 'demo'],
].map(([name, scrollTo]) => ({ name, scrollTo }));

// Accent color for this project, read from the shared `projects` data.
// Used below for the small section labels and the RoughNotation highlights.
const { themeColor: theme } = projects.signLanguageTranslation;

const ProjectPage = () => (
  <>
    <Seo title="Helping Hands" />
    <ProjectHeader projectObj={projects['signLanguageTranslation']} />
    <section className={`fix-top fix-bottom`} id={`overview`}>
      <Index chapters={chapters} />
      <div className={`content--md-center`}>
        <p className={`primary lh`}>As my minor technical project at my undergraduate university, I decided to work on American Sign Language Translation. I had always wanted to implement this idea because I believed the technology to solve this problem exists and could actually create an impact in some people's daily life.</p>
      </div>
    </section>
    {/* <div className={`content--md-center`}><hr /></div> */}
    <section className={`fix-top fix-bottom`} id={`brainstorming`}>
      <div className={`content--md-center`}>
      <h2 className={`add-margin-bottom`}>Brainstorming</h2>
        <p className={`primary add-margin-top`}>Most existing robust translators require additional setup — hand-wearable devices (gloves, rings) or specialized cameras such as depth-sensing cameras (used by Leap Motion) or kinect. However, if we want a solution that can be used anytime and anywhere, we cannot expect people to carry a glove or dedicated hardware for this purpose. Hence, we wanted to implement an algorithm/approach that could work with an ordinary mobile phone camera or webcam.</p>
        <div className={`add-margin-top`}>
            <h6 className={`center`} style={{ color: theme}}>Problem</h6>
            <h4 className={`center`}>How to design an <RoughNotation type={`circle`} show color={theme}>ideal</RoughNotation> sign language translator that can fulfill the role of a human interpreting sign real-time?</h4>
          </div>
          <p className={`primary lh add-margin-top`}>An ideal translator should be equivalent to a trained individual interpreting sign real-time looking at the person. For this, it needs to:</p>
          <ul>
            <li>Identify hand signal real time as the signal is made and held</li>
            <li>Use the sequence of classified signals to frame the intended sentence</li>
            <li>Speak the interpreted sentence once the actions are completed</li>
            <li>Filter out noise by understading context</li>
            <li>Should not require specialized hardware setup to run</li>
          </ul>
      </div>
    </section>
    <section className={`fix-top fix-bottom`} id={`development`}>
      <div className={`content--md-center`}>
        <h6 style={{ color: theme}}>Development</h6>
        <h2 className={`add-margin-bottom`}>The Implementation</h2>
        <p className={`primary`}></p>
        <div className={`project-image`}>
          <StaticImage
            src={'../../images/projects/sign-language/stages.png'}
            width={1200}
            quality={95}
            formats={["AUTO", "WEBP", "AVIF"]}
            alt="Stages of the programming experience"
            loading={`eager`}
          />
        </div>
        <h4>1. Recognising the hand</h4>
        <p className={`primary lh`}>
        We spent some time trying to identify where the hand is in the full-body frame using hand recognition algorithms (Haar files trained to capture the hand) but soon realised that doing that would exponentially decrease the accuracy of identifying a sign for two reasons. First, the hand recognition was not robust enough when interfered with some background noise, and since the rest of algorithm relies on it correctly cropping the hand from the frame, there was not much scope for error. Secondly, there are signs such as the <i>letter S (represented by a simple fist)</i> and <i>D (represented by pointing a finger)</i> which closely resemble everyday signs used by people, so the translator can pick those signals from the frame even though they are not intended for communication.
        </p>
        <h4 className={`add-margin-top`}>2. Predicting the alphabets</h4>
        <p className={`primary lh`}>
        This led us to dedicate an area for signal classification in the frame (depending on the person being left-handed or right-handed). Knowing where the hand is, we could then focus on predicting accurately the alphabet that the hand represents. We trained and saved a CNN model on a dataset we created by asking different people to sign in front of different backgrounds. We iterated our model parameters and improved our dataset till we achieved over <RoughNotation type={`circle`} show multiline color={theme}><b>98% accuracy</b></RoughNotation> in identifying random test signals. We, then loaded our saved model to make predictions real-time.
        </p>
        <h4 className={`add-margin-top`}>3. Generating a sentence</h4>
        <p className={`primary lh`}>
        The next step was to sample this prediction appropriately and add it sequentially to form a sentence. The sampling had to be done carefully to avoid the noise and use the signal which remains steady for a short duration till it changes. To implement it, we created sets of 15 predictions using 15 frames (approximately one second) and saw if the predictions were consistent accross most frames. If they were consistent, the prediction was added to the sentence and if it kept varying, nothing was added. Then, we used <RoughNotation type={`underline`} show multiline color={theme}>word segmentation</RoughNotation> on the formed sentence to add appropriate spaces real-time.
        </p>
        <h4 className={`add-margin-top`}>4. Converting to speech</h4>
        <p className={`primary lh`}>
        The only challenge in speech conversion was to identify when to speak the generated sentence. It is intuitive that once the person stops signing, the translator should speak the identified sentence. We set up a timer everytime a new prediction was added to the sentence. Knowing that a sign takes approximately 1-2 seconds to be added, if there was no new predicted character in five seconds, the translator would speak the sentence.
        </p>
      </div>
    </section>
    <section className={`fix-top fix-bottom`} id={`demo`}>
      <div className={`content--md-center`}>
        <h6 style={{ color: theme}}>Demo</h6>
        <h2 className={`add-margin-bottom`}>The Working Prototype</h2>
        <iframe
          width="100%"
          height="400"
          src="https://www.youtube-nocookie.com/embed/4F3gxMd5oeE"
          frameborder="0"
          allow="accelerometer; encrypted-media; gyroscope; picture-in-picture"
          allowfullscreen
        ></iframe>
        <p className={`primary lh`}>
        Here, you can view the working prototype built with Python, OpenCV, and Tensorflow (Keras)
        </p>
      </div>
    </section>
    <ReadNext projectObj={projects['mastercard']} />
  </>
)

export default ProjectPage
