webui : handle PDF input (as text or image) + convert pasted long content to file (#13562)

* webui : handle PDF input (as text or image) * handle the case where pdf image + server without mtmd * fix bug missing pages
2025-07-23 19:25:51 +00:00 · 2025-05-15 14:24:50 +02:00
parent c753d7bed0
commit 3cc1f1f1d2
8 changed files with 425 additions and 22 deletions
--- a/tools/server/webui/src/Config.ts
+++ b/tools/server/webui/src/Config.ts
@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
  showTokensPerSecond: false,
  showThoughtInProgress: false,
  excludeThoughtOnReq: true,
+  pasteLongTextToFileLen: 2500,
+  pdfAsImage: false,
  // make sure these default values are in sync with `common.h`
  samplers: 'edkypmxt',
  temperature: 0.8,
@ -43,6 +45,8 @@ export const CONFIG_DEFAULT = {
 export const CONFIG_INFO: Record<string, string> = {
  apiKey: 'Set the API Key if you are using --api-key option for the server.',
  systemMessage: 'The starting message that defines how model should behave.',
+  pasteLongTextToFileLen:
+    'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
  samplers:
    'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
  temperature:
--- a/tools/server/webui/src/components/ChatScreen.tsx
+++ b/tools/server/webui/src/components/ChatScreen.tsx
@ -306,6 +306,7 @@ function ChatInput({
  onStop: () => void;
  isGenerating: boolean;
 }) {
+  const { config } = useAppContext();
  const [isDrag, setIsDrag] = useState(false);

  return (
@ -328,7 +329,28 @@ function ChatInput({
        {({ getRootProps, getInputProps }) => (
          <div
            className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
+            // when a file is pasted to the input, we handle it here
+            // if a text is pasted, and if it is long text, we will convert it to a file
            onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
+              const text = e.clipboardData.getData('text/plain');
+              if (
+                text.length > 0 &&
+                config.pasteLongTextToFileLen > 0 &&
+                text.length > config.pasteLongTextToFileLen
+              ) {
+                // if the text is too long, we will convert it to a file
+                extraContext.addItems([
+                  {
+                    type: 'context',
+                    name: 'Pasted Content',
+                    content: text,
+                  },
+                ]);
+                e.preventDefault();
+                return;
+              }
+
+              // if a file is pasted, we will handle it here
              const files = Array.from(e.clipboardData.items)
                .filter((item) => item.kind === 'file')
                .map((item) => item.getAsFile())
--- a/tools/server/webui/src/components/SettingDialog.tsx
+++ b/tools/server/webui/src/components/SettingDialog.tsx
@ -100,6 +100,16 @@ const SETTING_SECTIONS: SettingSection[] = [
            key,
          }) as SettingFieldInput
      ),
+      {
+        type: SettingInputType.SHORT_INPUT,
+        label: 'Paste length to file',
+        key: 'pasteLongTextToFileLen',
+      },
+      {
+        type: SettingInputType.CHECKBOX,
+        label: 'Parse PDF as image instead of text',
+        key: 'pdfAsImage',
+      },
    ],
  },
  {
@ -452,10 +462,10 @@ function SettingsModalLongInput({
  label?: string;
 }) {
  return (
-    <label className="form-control mb-2">
-      <div className="label inline">{label || configKey}</div>
+    <label className="form-control">
+      <div className="label inline text-sm">{label || configKey}</div>
      <textarea
-        className="textarea textarea-bordered h-24"
+        className="textarea textarea-bordered h-24 mb-2"
        placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
        value={value}
        onChange={(e) => onChange(e.target.value)}
@ -482,9 +492,7 @@ function SettingsModalShortInput({
    <>
      {/* on mobile, we simply show the help message here */}
      {helpMsg && (
-        <div className="block md:hidden mb-1">
-          <b>{label || configKey}</b>
-          <br />
+        <div className="block mb-1 opacity-75">
          <p className="text-xs">{helpMsg}</p>
        </div>
      )}
@ -493,11 +501,6 @@ function SettingsModalShortInput({
          <div tabIndex={0} role="button" className="font-bold hidden md:block">
            {label || configKey}
          </div>
-          {helpMsg && (
-            <div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
-              {helpMsg}
-            </div>
-          )}
        </div>
        <input
          type="text"
--- a/tools/server/webui/src/components/useChatExtraContext.tsx
+++ b/tools/server/webui/src/components/useChatExtraContext.tsx
@ -2,6 +2,17 @@ import { useState } from 'react';
 import { MessageExtra } from '../utils/types';
 import toast from 'react-hot-toast';
 import { useAppContext } from '../utils/app.context';
+import * as pdfjs from 'pdfjs-dist';
+import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
+import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
+
+pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
+
+// This file handles uploading extra context items (a.k.a files)
+// It allows processing these kinds of files:
+// - image files (converted to base64)
+// - text files (including code files)
+// - pdf (converted to text, or to one image per page when `pdfAsImage` is enabled and the server supports vision)

 // Interface describing the API returned by the hook
 export interface ChatExtraContextApi {
@ -13,7 +24,7 @@ export interface ChatExtraContextApi {
 }

 export function useChatExtraContext(): ChatExtraContextApi {
-  const { serverProps } = useAppContext();
+  const { serverProps, config } = useAppContext();
  const [items, setItems] = useState<MessageExtra[]>([]);

  const addItems = (newItems: MessageExtra[]) => {
@ -28,6 +39,8 @@ export function useChatExtraContext(): ChatExtraContextApi {
    setItems([]);
  };

+  const isSupportVision = serverProps?.modalities?.vision;
+
  const onFileAdded = (files: File[]) => {
    for (const file of files) {
      const mimeType = file.type;
@ -38,7 +51,7 @@ export function useChatExtraContext(): ChatExtraContextApi {
      }

      if (mimeType.startsWith('image/')) {
-        if (!serverProps?.modalities?.vision) {
+        if (!isSupportVision) {
          toast.error('Multimodal is not supported by this server or model.');
          break;
        }
@ -69,7 +82,43 @@ export function useChatExtraContext(): ChatExtraContextApi {
        toast.error('Video and audio files are not supported yet.');
        break;
      } else if (mimeType.startsWith('application/pdf')) {
-        toast.error('PDF files are not supported yet.');
+        if (config.pdfAsImage && !isSupportVision) {
+          toast(
+            'Multimodal is not supported, PDF will be converted to text instead of image.'
+          );
+          break;
+        }
+
+        const promise =
+          config.pdfAsImage && isSupportVision
+            ? convertPDFToImage(file).then((base64Urls) => {
+                addItems(
+                  base64Urls.map((base64Url) => ({
+                    type: 'imageFile',
+                    name: file.name,
+                    base64Url,
+                  }))
+                );
+              })
+            : convertPDFToText(file).then((content) => {
+                if (isSupportVision) {
+                  toast.success(
+                    'PDF file converted to text. You can also convert it to image, see in Settings.'
+                  );
+                }
+                addItems([
+                  {
+                    type: 'textFile',
+                    name: file.name,
+                    content,
+                  },
+                ]);
+              });
+
+        promise.catch((error) => {
+          console.error(error);
+          toast.error('Failed to parse PDF file.');
+        });
        break;
      } else {
        // Because there can be many text file types (like code file), we will not check the mime type
@ -105,11 +154,69 @@ export function useChatExtraContext(): ChatExtraContextApi {
  };
 }

+/**
+ * Reads the whole content of a file into memory using a FileReader.
+ *
+ * @param file - the file to read
+ * @returns a promise resolving to the file content as an ArrayBuffer
+ * @throws the promise rejects when the reader reports an error, the read is
+ *         aborted, or the reader finishes without producing an ArrayBuffer
+ */
+async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
+  return new Promise((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onload = () => {
+      const result = reader.result;
+      if (result instanceof ArrayBuffer) {
+        resolve(result);
+      } else {
+        reject(new Error('Failed to read file.'));
+      }
+    };
+    // Without these handlers, a failed or aborted read would never settle the
+    // promise, so callers would wait forever and never report the failure.
+    reader.onerror = () => {
+      reject(reader.error ?? new Error('Failed to read file.'));
+    };
+    reader.onabort = () => {
+      reject(new Error('Failed to read file.'));
+    };
+    reader.readAsArrayBuffer(file);
+  });
+}
+
+/**
+ * Extracts the text of a PDF file.
+ *
+ * The text content of every page is requested concurrently; the `str` of each
+ * text item (or an empty string when it has none) is then collected in page
+ * order and joined with newlines.
+ *
+ * @param file - the PDF file to convert
+ * @returns a promise resolving to the extracted text
+ */
+async function convertPDFToText(file: File): Promise<string> {
+  const data = await getFileAsBuffer(file);
+  const doc = await pdfjs.getDocument(data).promise;
+  // getPage is called with page numbers 1..numPages
+  const pageNumbers = Array.from({ length: doc.numPages }, (_, idx) => idx + 1);
+  const contents: TextContent[] = await Promise.all(
+    pageNumbers.map(async (pageNo) => {
+      const page = await doc.getPage(pageNo);
+      return page.getTextContent();
+    })
+  );
+  const pieces: string[] = [];
+  for (const content of contents) {
+    for (const item of content.items) {
+      pieces.push((item as TextItem).str ?? '');
+    }
+  }
+  return pieces.join('\n');
+}
+
+/**
+ * Renders every page of a PDF file to an image.
+ *
+ * Pages are fetched one after another; each one is drawn at scale 1.5 onto its
+ * own freshly created canvas, and the canvas is serialized with `toDataURL()`
+ * once its render task has finished.
+ *
+ * @param file - the PDF file to convert
+ * @returns a promise resolving to one data URL string per page, in page order
+ * @throws when a 2D drawing context cannot be obtained from a canvas
+ */
+async function convertPDFToImage(file: File): Promise<string[]> {
+  const data = await getFileAsBuffer(file);
+  const pdf = await pdfjs.getDocument(data).promise;
+  const rendered: Promise<string>[] = [];
+
+  for (let pageNo = 1; pageNo <= pdf.numPages; pageNo += 1) {
+    const pdfPage = await pdf.getPage(pageNo);
+    const viewport = pdfPage.getViewport({ scale: 1.5 });
+    const canvas = document.createElement('canvas');
+    const canvasContext = canvas.getContext('2d');
+    canvas.width = viewport.width;
+    canvas.height = viewport.height;
+    if (!canvasContext) {
+      throw new Error('Failed to get 2D context from canvas');
+    }
+    const renderTask = pdfPage.render({ canvasContext, viewport });
+    // Serialize the canvas only after the page has finished rendering
+    rendered.push(renderTask.promise.then(() => canvas.toDataURL()));
+  }
+
+  return Promise.all(rendered);
+}
+
 // WARN: vibe code below
 // This code is a heuristic to determine if a string is likely not binary.
 // It is necessary because input file can have various mime types which we don't have time to investigate.
 // For example, a python file can be text/plain, application/x-python, etc.
-export function isLikelyNotBinary(str: string): boolean {
+function isLikelyNotBinary(str: string): boolean {
  const options = {
    prefixLength: 1024 * 10, // Check the first 10KB of the string
    suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars